howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28import math
   29
   30from howard.functions.commons import *
   31from howard.objects.database import *
   32from howard.functions.databases import *
   33from howard.functions.utils import *
   34
   35
   36class Variants:
   37
    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the Variants object: input/output files, configuration,
        parameters, database connexion, VCF header and sample list, then
        optionally load the input data.

        NOTE(review): the setter calls below are order-dependent (the connexion
        depends on config/input format, the header on the input) — do not
        reorder them.

        :param conn: an existing database connexion (duckdb or sqlite);
            if None, `set_connexion` creates one
        :param input: the input file (path string or file-like with `name`)
        :param output: the output file
        :param config: a dictionary containing the configuration
        :param param: a dictionary containing the parameters
        :param load: if True, load the input data immediately via `load_data`
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
  105    def get_samples(self) -> list:
  106        """
  107        This function returns a list of samples.
  108        :return: The `get_samples` method is returning the `samples` attribute of the object.
  109        """
  110
  111        return self.samples
  112
  113    def get_samples_check(self) -> bool:
  114        """
  115        This function returns the value of the "check" key within the "samples" dictionary retrieved
  116        from the parameters.
  117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  119        method. If the key "check" is not found, it will return `False`.
  120        """
  121
  122        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
  150    def set_config(self, config: dict) -> None:
  151        """
  152        The set_config function takes a config object and assigns it as the configuration object for the
  153        class.
  154
  155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  156        contains configuration settings for the class. When you call the `set_config` function with a
  157        dictionary object as the argument, it will set that dictionary as the configuration object for
  158        the class
  159        :type config: dict
  160        """
  161
  162        self.config = config
  163
  164    def set_param(self, param: dict) -> None:
  165        """
  166        This function sets a parameter object for the class based on the input dictionary.
  167
  168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  169        as the `param` attribute of the class instance
  170        :type param: dict
  171        """
  172
  173        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
    def set_header(self) -> None:
        """
        Read or build the VCF header of the input file and store it both as a
        list of strings (`header_list`) and as a parsed VCF object
        (`header_vcf`). When there is no input file, both are set to None.

        Header lookup order for supported formats:
        1. an explicit header file given by the "header_file" config key;
        2. the header embedded in a vcf/hdr input file (bgzip-compressed or
           plain text);
        3. a sibling "<input>.hdr" file;
        4. a header reconstructed from the file's columns via `Database`;
        5. a minimal default VCFv4.2 header as last resort.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # minimal fallback header: VCFv4.2 with the standard columns only
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # 1. header file explicitly provided in config
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # 2. header embedded within the input file itself (vcf/hdr)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # compressed vcf file (.vcf.gz): read through bgzf
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # uncompressed vcf file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # 3. header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    # 4. try to reconstruct header from info fields and columns
                    try:

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except deliberately turns ANY failure
                    # of the reconstruction above into the default-header
                    # fallback; consider narrowing to `except Exception`
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unsupported format: fail explicitly

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list of raw lines
            self.header_list = header_list

            # header parsed as a VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # no input file: no header available
            self.header_list = None
            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
  527    def get_overview(self) -> None:
  528        """
  529        The function prints the input, output, config, and dataframe of the current object
  530        """
  531        table_variants_from = self.get_table_variants(clause="from")
  532        sql_columns = self.get_header_columns_as_sql()
  533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  534        df = self.get_query_to_df(sql_query_export)
  535        log.info(
  536            "Input:  "
  537            + str(self.get_input())
  538            + " ["
  539            + str(str(self.get_input_format()))
  540            + "]"
  541        )
  542        log.info(
  543            "Output: "
  544            + str(self.get_output())
  545            + " ["
  546            + str(str(self.get_output_format()))
  547            + "]"
  548        )
  549        log.info("Config: ")
  550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  551            "\n"
  552        ):
  553            log.info("\t" + str(d))
  554        log.info("Param: ")
  555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  556            "\n"
  557        ):
  558            log.info("\t" + str(d))
  559        log.info("Sample list: " + str(self.get_header_sample_list()))
  560        log.info("Dataframe: ")
  561        for d in str(df).split("\n"):
  562            log.info("\t" + str(d))
  563
  564        # garbage collector
  565        del df
  566        gc.collect()
  567
  568        return None
  569
    def get_stats(self) -> dict:
        """
        Compute statistics of the loaded variants: input file info, variant
        counts (total, by chromosome, by type), per-sample genotype counts,
        INFO/FORMAT header fields, quality metrics and SNV substitutions.

        NOTE(review): the SQL below uses DuckDB-dialect functions
        (REGEXP_EXTRACT, string_split, len, median) — presumably requires a
        duckdb connexion; verify behaviour with a sqlite connexion.

        :return: a dictionary with keys "Infos", "Variants", "Header", plus
            "Samples" when genotypes are present and "Quality" when a QUAL
            column exists
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field definitions from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when genotypes (GT) are present.
        # A sample column counts only rows with a well-formed genotype and a
        # sample field matching the FORMAT field's number of sub-fields.
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running row index shared across both field tables
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF "Number" codes to their symbolic form
                # (None='.', -1='A' per-alt, -2='G' per-genotype, -3='R' per-allele)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type ('.' when missing)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc (empty string when missing)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: summary statistics of the QUAL column, skipping '.' values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel counts
        # NOTE(review): in the InDel branch, AND binds tighter than OR, so the
        # condition reads `len(REF)>1 OR (len(ALT)>1 AND len(REF)!=len(ALT))`
        # — confirm whether parentheses around the OR were intended

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Single-nucleotide substitution spectrum (e.g. 'A>G')
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
  815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  816        """
  817        The `print_stats` function generates a markdown file and prints the statistics contained in a
  818        JSON file in a formatted manner.
  819
  820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  822        provided, a temporary directory will be created and the stats will be saved in a file named
  823        "stats.md" within that
  824        :type output_file: str
  825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  826        file where the statistics will be saved. If no value is provided, a temporary directory will be
  827        created and a default file name "stats.json" will be used
  828        :type json_file: str
  829        :return: The function `print_stats` does not return any value. It has a return type annotation
  830        of `None`.
  831        """
  832
  833        # Full path
  834        output_file = full_path(output_file)
  835        json_file = full_path(json_file)
  836
  837        with tempfile.TemporaryDirectory() as tmpdir:
  838
  839            # Files
  840            if not output_file:
  841                output_file = os.path.join(tmpdir, "stats.md")
  842            if not json_file:
  843                json_file = os.path.join(tmpdir, "stats.json")
  844
  845            # Create folders
  846            if not os.path.exists(os.path.dirname(output_file)):
  847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  848            if not os.path.exists(os.path.dirname(json_file)):
  849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  850
  851            # Create stats JSON file
  852            stats_file = self.stats_to_file(file=json_file)
  853
  854            # Print stats file
  855            with open(stats_file) as f:
  856                stats = yaml.safe_load(f)
  857
  858            # Output
  859            output_title = []
  860            output_index = []
  861            output = []
  862
  863            # Title
  864            output_title.append("# HOWARD Stats")
  865
  866            # Index
  867            output_index.append("## Index")
  868
  869            # Process sections
  870            for section in stats:
  871                infos = stats.get(section)
  872                section_link = "#" + section.lower().replace(" ", "-")
  873                output.append(f"## {section}")
  874                output_index.append(f"- [{section}]({section_link})")
  875
  876                if len(infos):
  877                    for info in infos:
  878                        try:
  879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  880                            is_df = True
  881                        except:
  882                            try:
  883                                df = pd.DataFrame.from_dict(
  884                                    json.loads((infos.get(info))), orient="index"
  885                                )
  886                                is_df = True
  887                            except:
  888                                is_df = False
  889                        if is_df:
  890                            output.append(f"### {info}")
  891                            info_link = "#" + info.lower().replace(" ", "-")
  892                            output_index.append(f"   - [{info}]({info_link})")
  893                            output.append(f"{df.to_markdown(index=False)}")
  894                        else:
  895                            output.append(f"- {info}: {infos.get(info)}")
  896                else:
  897                    output.append(f"NA")
  898
  899            # Write stats in markdown file
  900            with open(output_file, "w") as fp:
  901                for item in output_title:
  902                    fp.write("%s\n" % item)
  903                for item in output_index:
  904                    fp.write("%s\n" % item)
  905                for item in output:
  906                    fp.write("%s\n" % item)
  907
  908            # Output stats in markdown
  909            print("")
  910            print("\n\n".join(output_title))
  911            print("")
  912            print("\n\n".join(output))
  913            print("")
  914
  915        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
 1262    def insert_file_to_table(
 1263        self,
 1264        file,
 1265        columns: str,
 1266        header_len: int = 0,
 1267        sep: str = "\t",
 1268        chunksize: int = 1000000,
 1269    ) -> None:
 1270        """
 1271        The function reads a file in chunks and inserts each chunk into a table based on the specified
 1272        database format.
 1273
 1274        :param file: The `file` parameter is the file that you want to load into a table. It should be
 1275        the path to the file on your system
 1276        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
 1277        should contain the names of the columns in the table where the data will be inserted. The column
 1278        names should be separated by commas within the string. For example, if you have columns named
 1279        "id", "name
 1280        :type columns: str
 1281        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
 1282        the number of lines to skip at the beginning of the file before reading the actual data. This
 1283        parameter allows you to skip any header information present in the file before processing the
 1284        data, defaults to 0
 1285        :type header_len: int (optional)
 1286        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
 1287        separator character that is used in the file being read. In this case, the default separator is
 1288        set to `\t`, which represents a tab character. You can change this parameter to a different
 1289        separator character if, defaults to \t
 1290        :type sep: str (optional)
 1291        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
 1292        when processing the file in chunks. In the provided code snippet, the default value for
 1293        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
 1294        to 1000000
 1295        :type chunksize: int (optional)
 1296        """
 1297
 1298        # Config
 1299        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
 1300        connexion_format = self.get_connexion_format()
 1301
 1302        log.debug("chunksize: " + str(chunksize))
 1303
 1304        if chunksize:
 1305            for chunk in pd.read_csv(
 1306                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
 1307            ):
 1308                if connexion_format in ["duckdb"]:
 1309                    sql_insert_into = (
 1310                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
 1311                    )
 1312                    self.conn.execute(sql_insert_into)
 1313                elif connexion_format in ["sqlite"]:
 1314                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
 1316    def load_data(
 1317        self,
 1318        input_file: str = None,
 1319        drop_variants_table: bool = False,
 1320        sample_size: int = 20480,
 1321    ) -> None:
 1322        """
 1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1324        table before loading the data and specify a sample size.
 1325
 1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1327        table
 1328        :type input_file: str
 1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1330        determines whether the variants table should be dropped before loading the data. If set to
 1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1332        not be dropped, defaults to False
 1333        :type drop_variants_table: bool (optional)
 1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1336        20480
 1337        :type sample_size: int (optional)
 1338        """
 1339
 1340        log.info("Loading...")
 1341
 1342        # change input file
 1343        if input_file:
 1344            self.set_input(input_file)
 1345            self.set_header()
 1346
 1347        # drop variants table
 1348        if drop_variants_table:
 1349            self.drop_variants_table()
 1350
 1351        # get table variants
 1352        table_variants = self.get_table_variants()
 1353
 1354        # Access
 1355        access = self.get_config().get("access", None)
 1356        log.debug(f"access: {access}")
 1357
 1358        # Input format and compress
 1359        input_format = self.get_input_format()
 1360        input_compressed = self.get_input_compressed()
 1361        log.debug(f"input_format: {input_format}")
 1362        log.debug(f"input_compressed: {input_compressed}")
 1363
 1364        # input_compressed_format
 1365        if input_compressed:
 1366            input_compressed_format = "gzip"
 1367        else:
 1368            input_compressed_format = "none"
 1369        log.debug(f"input_compressed_format: {input_compressed_format}")
 1370
 1371        # Connexion format
 1372        connexion_format = self.get_connexion_format()
 1373
 1374        # Sample size
 1375        if not sample_size:
 1376            sample_size = -1
 1377        log.debug(f"sample_size: {sample_size}")
 1378
 1379        # Load data
 1380        log.debug(f"Load Data from {input_format}")
 1381
 1382        # DuckDB connexion
 1383        if connexion_format in ["duckdb"]:
 1384
 1385            # Database already exists
 1386            if self.input_format in ["db", "duckdb"]:
 1387
 1388                if connexion_format in ["duckdb"]:
 1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1390                else:
 1391                    log.error(
 1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1393                    )
 1394                    raise ValueError(
 1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1396                    )
 1397
 1398            # Load from existing database format
 1399            else:
 1400
 1401                try:
 1402                    # Create Table or View
 1403                    database = Database(database=self.input)
 1404                    sql_from = database.get_sql_from(sample_size=sample_size)
 1405
 1406                    if access in ["RO"]:
 1407                        sql_load = (
 1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1409                        )
 1410                    else:
 1411                        sql_load = (
 1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1413                        )
 1414                    self.conn.execute(sql_load)
 1415
 1416                except:
 1417                    # Format not available
 1418                    log.error(f"Input file format '{self.input_format}' not available")
 1419                    raise ValueError(
 1420                        f"Input file format '{self.input_format}' not available"
 1421                    )
 1422
 1423        # SQLite connexion
 1424        elif connexion_format in ["sqlite"] and input_format in [
 1425            "vcf",
 1426            "tsv",
 1427            "csv",
 1428            "psv",
 1429        ]:
 1430
 1431            # Main structure
 1432            structure = {
 1433                "#CHROM": "VARCHAR",
 1434                "POS": "INTEGER",
 1435                "ID": "VARCHAR",
 1436                "REF": "VARCHAR",
 1437                "ALT": "VARCHAR",
 1438                "QUAL": "VARCHAR",
 1439                "FILTER": "VARCHAR",
 1440                "INFO": "VARCHAR",
 1441            }
 1442
 1443            # Strcuture with samples
 1444            structure_complete = structure
 1445            if self.get_header_sample_list():
 1446                structure["FORMAT"] = "VARCHAR"
 1447                for sample in self.get_header_sample_list():
 1448                    structure_complete[sample] = "VARCHAR"
 1449
 1450            # Columns list for create and insert
 1451            sql_create_table_columns = []
 1452            sql_create_table_columns_list = []
 1453            for column in structure_complete:
 1454                column_type = structure_complete[column]
 1455                sql_create_table_columns.append(
 1456                    f'"{column}" {column_type} default NULL'
 1457                )
 1458                sql_create_table_columns_list.append(f'"{column}"')
 1459
 1460            # Create database
 1461            log.debug(f"Create Table {table_variants}")
 1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1465            self.conn.execute(sql_create_table)
 1466
 1467            # chunksize define length of file chunk load file
 1468            chunksize = 100000
 1469
 1470            # delimiter
 1471            delimiter = file_format_delimiters.get(input_format, "\t")
 1472
 1473            # Load the input file
 1474            with open(self.input, "rt") as input_file:
 1475
 1476                # Use the appropriate file handler based on the input format
 1477                if input_compressed:
 1478                    input_file = bgzf.open(self.input, "rt")
 1479                if input_format in ["vcf"]:
 1480                    header_len = self.get_header_length()
 1481                else:
 1482                    header_len = 0
 1483
 1484                # Insert the file contents into a table
 1485                self.insert_file_to_table(
 1486                    input_file,
 1487                    columns=sql_create_table_columns_list_sql,
 1488                    header_len=header_len,
 1489                    sep=delimiter,
 1490                    chunksize=chunksize,
 1491                )
 1492
 1493        else:
 1494            log.error(
 1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1496            )
 1497            raise ValueError(
 1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1499            )
 1500
 1501        # Explode INFOS fields into table fields
 1502        if self.get_explode_infos():
 1503            self.explode_infos(
 1504                prefix=self.get_explode_infos_prefix(),
 1505                fields=self.get_explode_infos_fields(),
 1506                force=True,
 1507            )
 1508
 1509        # Create index after insertion
 1510        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(field)
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
 1771    def explode_infos(
 1772        self,
 1773        prefix: str = None,
 1774        create_index: bool = False,
 1775        fields: list = None,
 1776        force: bool = False,
 1777        proccess_all_fields_together: bool = False,
 1778        table: str = None,
 1779    ) -> list:
 1780        """
 1781        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1782        individual columns, returning a list of added columns.
 1783
 1784        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1785        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1786        `self.get_explode_infos_prefix()` as the prefix
 1787        :type prefix: str
 1788        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1789        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1790        `False`, indexes will not be created. The default value is `False`, defaults to False
 1791        :type create_index: bool (optional)
 1792        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1793        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1794        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1795        a list to the `
 1796        :type fields: list
 1797        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1798        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1799        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1800        defaults to False
 1801        :type force: bool (optional)
 1802        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1803        flag that determines whether to process all the INFO fields together or individually. If set to
 1804        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1805        be processed individually. The default value is, defaults to False
 1806        :type proccess_all_fields_together: bool (optional)
 1807        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1808        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1809        a value for the `table` parameter, the function will use that table name. If the `table`
 1810        parameter is
 1811        :type table: str
 1812        :return: The `explode_infos` function returns a list of added columns.
 1813        """
 1814
 1815        # drop indexes
 1816        self.drop_indexes()
 1817
 1818        # connexion format
 1819        connexion_format = self.get_connexion_format()
 1820
 1821        # Access
 1822        access = self.get_config().get("access", None)
 1823
 1824        # Added columns
 1825        added_columns = []
 1826
 1827        if access not in ["RO"]:
 1828
 1829            # prefix
 1830            if prefix in [None, True] or not isinstance(prefix, str):
 1831                if self.get_explode_infos_prefix() not in [None, True]:
 1832                    prefix = self.get_explode_infos_prefix()
 1833                else:
 1834                    prefix = "INFO/"
 1835
 1836            # table variants
 1837            if table is not None:
 1838                table_variants = table
 1839            else:
 1840                table_variants = self.get_table_variants(clause="select")
 1841
 1842            # extra infos
 1843            try:
 1844                extra_infos = self.get_extra_infos()
 1845            except:
 1846                extra_infos = []
 1847
 1848            # Header infos
 1849            header_infos = self.get_header().infos
 1850
 1851            log.debug(
 1852                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1853            )
 1854
 1855            sql_info_alter_table_array = []
 1856
 1857            # Info fields to check
 1858            fields_list = list(header_infos)
 1859            if fields:
 1860                fields_list += fields
 1861            fields_list = set(fields_list)
 1862
 1863            # If no fields
 1864            if not fields:
 1865                fields = []
 1866
 1867            # Translate fields if patterns
 1868            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1869
 1870            for info in fields:
 1871
 1872                info_id_sql = prefix + info
 1873
 1874                if (
 1875                    info in fields_list
 1876                    or prefix + info in fields_list
 1877                    or info in extra_infos
 1878                ):
 1879
 1880                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1881
 1882                    if info in header_infos:
 1883                        info_type = header_infos[info].type
 1884                        info_num = header_infos[info].num
 1885                    else:
 1886                        info_type = "String"
 1887                        info_num = 0
 1888
 1889                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1890                    if info_num != 1:
 1891                        type_sql = "VARCHAR"
 1892
 1893                    # Add field
 1894                    added_column = self.add_column(
 1895                        table_name=table_variants,
 1896                        column_name=info_id_sql,
 1897                        column_type=type_sql,
 1898                        default_value="null",
 1899                        drop=force,
 1900                    )
 1901
 1902                    if added_column:
 1903                        added_columns.append(added_column)
 1904
 1905                    if added_column or force:
 1906
 1907                        # add field to index
 1908                        self.index_additionnal_fields.append(info_id_sql)
 1909
 1910                        # Update field array
 1911                        if connexion_format in ["duckdb"]:
 1912                            update_info_field = f"""
 1913                            "{info_id_sql}" =
 1914                                CASE
 1915                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1916                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1917                                END
 1918                            """
 1919                        elif connexion_format in ["sqlite"]:
 1920                            update_info_field = f"""
 1921                                "{info_id_sql}" =
 1922                                    CASE
 1923                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1924                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1925                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1926                                    END
 1927                            """
 1928
 1929                        sql_info_alter_table_array.append(update_info_field)
 1930
 1931            if sql_info_alter_table_array:
 1932
 1933                # By chromosomes
 1934                try:
 1935                    chromosomes_list = list(
 1936                        self.get_query_to_df(
 1937                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1938                        )["#CHROM"]
 1939                    )
 1940                except:
 1941                    chromosomes_list = [None]
 1942
 1943                for chrom in chromosomes_list:
 1944                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1945
 1946                    # Where clause
 1947                    where_clause = ""
 1948                    if chrom and len(chromosomes_list) > 1:
 1949                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1950
 1951                    # Update table
 1952                    if proccess_all_fields_together:
 1953                        sql_info_alter_table_array_join = ", ".join(
 1954                            sql_info_alter_table_array
 1955                        )
 1956                        if sql_info_alter_table_array_join:
 1957                            sql_info_alter_table = f"""
 1958                                UPDATE {table_variants}
 1959                                SET {sql_info_alter_table_array_join}
 1960                                {where_clause}
 1961                                """
 1962                            log.debug(
 1963                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1964                            )
 1965                            # log.debug(sql_info_alter_table)
 1966                            self.conn.execute(sql_info_alter_table)
 1967                    else:
 1968                        sql_info_alter_num = 0
 1969                        for sql_info_alter in sql_info_alter_table_array:
 1970                            sql_info_alter_num += 1
 1971                            sql_info_alter_table = f"""
 1972                                UPDATE {table_variants}
 1973                                SET {sql_info_alter}
 1974                                {where_clause}
 1975                                """
 1976                            log.debug(
 1977                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1978                            )
 1979                            # log.debug(sql_info_alter_table)
 1980                            self.conn.execute(sql_info_alter_table)
 1981
 1982        # create indexes
 1983        if create_index:
 1984            self.create_indexes()
 1985
 1986        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099        fields_to_rename: dict | None = None
 2100    ) -> bool:
 2101        """
 2102        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2103        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2104        partitioning.
 2105        
 2106        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2107        output file where the exported data will be saved
 2108        :type output_file: str | None
 2109        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2110        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2111        header will be exported to a file with the same name as the `output_file` parameter, but with
 2112        the extension "
 2113        :type output_header: str | None
 2114        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2115        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2116        True, the header will be exported to a file. If `export_header` is False, the header will not
 2117        be, defaults to True
 2118        :type export_header: bool (optional)
 2119        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2120        that can be used to filter and select specific data from the VCF file before exporting it. If
 2121        provided, only the data that matches the query will be exported. This allows you to customize
 2122        the exported data based on
 2123        :type query: str | None
 2124        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2125        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2126        organize data in a hierarchical directory structure based on the values of one or more columns.
 2127        This can improve query performance when working with large datasets
 2128        :type parquet_partitions: list | None
 2129        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2130        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2131        multiple files. It helps in optimizing the export process by breaking down the data into
 2132        manageable chunks for processing and storage
 2133        :type chunk_size: int | None
 2134        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2135        threads to be used during the export process. It determines the level of parallelism and can
 2136        improve the performance of the export operation. If this parameter is not provided, the function
 2137        will use the default number of threads
 2138        :type threads: int | None
 2139        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2140        determines whether the output file should be sorted based on genomic coordinates of the
 2141        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2142        `False`,, defaults to False
 2143        :type sort: bool (optional)
 2144        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2145        determines whether an index should be created on the output file. If `index` is set to `True`,
 2146        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2147        :type index: bool (optional)
 2148        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2149        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2150        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2151        output file should be
 2152        :type order_by: str | None
 2153        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2154        mapping of field names to be renamed during the export process. This parameter allows you to
 2155        customize the output field names before exporting the data. Each key-value pair in the
 2156        dictionary represents the original field name as the key and the new field name
 2157        :type fields_to_rename: dict | None
 2158        :return: The `export_output` function returns a boolean value. It checks if the output file
 2159        exists and returns True if it does, or None if it doesn't.
 2160        """
 2161
 2162        # Log
 2163        log.info("Exporting...")
 2164
 2165        # Full path
 2166        output_file = full_path(output_file)
 2167        output_header = full_path(output_header)
 2168
 2169        # Config
 2170        config = self.get_config()
 2171
 2172        # Param
 2173        param = self.get_param()
 2174
 2175        # Tmp files to remove
 2176        tmp_to_remove = []
 2177
 2178        # If no output, get it
 2179        if not output_file:
 2180            output_file = self.get_output()
 2181
 2182        # If not threads
 2183        if not threads:
 2184            threads = self.get_threads()
 2185
 2186        # Rename fields
 2187        if not fields_to_rename:
 2188            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2189        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2190
 2191        # Auto header name with extension
 2192        if export_header or output_header:
 2193            if not output_header:
 2194                output_header = f"{output_file}.hdr"
 2195            # Export header
 2196            self.export_header(output_file=output_file)
 2197
 2198        # Switch off export header if VCF output
 2199        output_file_type = get_file_format(output_file)
 2200        if output_file_type in ["vcf"]:
 2201            export_header = False
 2202            tmp_to_remove.append(output_header)
 2203
 2204        # Chunk size
 2205        if not chunk_size:
 2206            chunk_size = config.get("chunk_size", None)
 2207
 2208        # Parquet partition
 2209        if not parquet_partitions:
 2210            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2211        if parquet_partitions and isinstance(parquet_partitions, str):
 2212            parquet_partitions = parquet_partitions.split(",")
 2213
 2214        # Order by
 2215        if not order_by:
 2216            order_by = param.get("export", {}).get("order_by", "")
 2217
 2218        # Header in output
 2219        header_in_output = param.get("export", {}).get("include_header", False)
 2220
 2221        # Database
 2222        database_source = self.get_connexion()
 2223
 2224        # Connexion format
 2225        connexion_format = self.get_connexion_format()
 2226
 2227        # Explode infos
 2228        if self.get_explode_infos():
 2229            self.explode_infos(
 2230                prefix=self.get_explode_infos_prefix(),
 2231                fields=self.get_explode_infos_fields(),
 2232                force=False,
 2233            )
 2234
 2235        # if connexion_format in ["sqlite"] or query:
 2236        if connexion_format in ["sqlite"]:
 2237
 2238            # Export in Parquet
 2239            random_tmp = "".join(
 2240                random.choice(string.ascii_lowercase) for i in range(10)
 2241            )
 2242            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2243            tmp_to_remove.append(database_source)
 2244
 2245            # Table Variants
 2246            table_variants = self.get_table_variants()
 2247
 2248            # Create export query
 2249            sql_query_export_subquery = f"""
 2250                SELECT * FROM {table_variants}
 2251                """
 2252
 2253            # Write source file
 2254            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2255
 2256        # Create database
 2257        database = Database(
 2258            database=database_source,
 2259            table="variants",
 2260            header_file=output_header,
 2261            conn_config=self.get_connexion_config(),
 2262        )
 2263
 2264        # Existing colomns header
 2265        existing_columns_header = database.get_header_columns_from_database(query=query)
 2266
 2267        # Sample list
 2268        if output_file_type in ["vcf"]:
 2269            get_samples = self.get_samples()
 2270            get_samples_check = self.get_samples_check()
 2271            samples_force = get_samples is not None
 2272            sample_list = self.get_header_sample_list(
 2273                check=get_samples_check,
 2274                samples=get_samples,
 2275                samples_force=samples_force,
 2276            )
 2277        else:
 2278            sample_list = None
 2279
 2280        # Export file
 2281        database.export(
 2282            output_database=output_file,
 2283            output_header=output_header,
 2284            existing_columns_header=existing_columns_header,
 2285            parquet_partitions=parquet_partitions,
 2286            chunk_size=chunk_size,
 2287            threads=threads,
 2288            sort=sort,
 2289            index=index,
 2290            header_in_output=header_in_output,
 2291            order_by=order_by,
 2292            query=query,
 2293            export_header=export_header,
 2294            sample_list=sample_list,
 2295        )
 2296
 2297        # Remove
 2298        remove_if_exists(tmp_to_remove)
 2299
 2300        return (os.path.exists(output_file) or None) and (
 2301            os.path.exists(output_file) or None
 2302        )
 2303
 2304    def get_extra_infos(self, table: str = None) -> list:
 2305        """
 2306        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2307        in the header.
 2308
 2309        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2310        name of the table from which you want to retrieve the extra columns that are not present in the
 2311        header. If the `table` parameter is not provided when calling the function, it will default to
 2312        using the variants
 2313        :type table: str
 2314        :return: A list of columns that are in the specified table but not in the header of the table.
 2315        """
 2316
 2317        header_columns = []
 2318
 2319        if not table:
 2320            table = self.get_table_variants(clause="from")
 2321            header_columns = self.get_header_columns()
 2322
 2323        # Check all columns in the database
 2324        query = f""" SELECT * FROM {table} LIMIT 1 """
 2325        log.debug(f"query {query}")
 2326        table_columns = self.get_query_to_df(query).columns.tolist()
 2327        extra_columns = []
 2328
 2329        # Construct extra infos (not in header)
 2330        for column in table_columns:
 2331            if column not in header_columns:
 2332                extra_columns.append(column)
 2333
 2334        return extra_columns
 2335
 2336    def get_extra_infos_sql(self, table: str = None) -> str:
 2337        """
 2338        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2339        by double quotes
 2340
 2341        :param table: The name of the table to get the extra infos from. If None, the default table is
 2342        used
 2343        :type table: str
 2344        :return: A string of the extra infos
 2345        """
 2346
 2347        return ", ".join(
 2348            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2349        )
 2350
 2351    def export_header(
 2352        self,
 2353        header_name: str = None,
 2354        output_file: str = None,
 2355        output_file_ext: str = ".hdr",
 2356        clean_header: bool = True,
 2357        remove_chrom_line: bool = False,
 2358    ) -> str:
 2359        """
 2360        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2361        specified options, and writes it to a new file.
 2362
 2363        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2364        this parameter is not specified, the header will be written to the output file
 2365        :type header_name: str
 2366        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2367        specify the name of the output file where the header will be written. If this parameter is not
 2368        provided, the header will be written to a temporary file
 2369        :type output_file: str
 2370        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2371        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2372        if not specified by the user. This extension will be appended to the `output_file` name to
 2373        create the final, defaults to .hdr
 2374        :type output_file_ext: str (optional)
 2375        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2376        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2377        `True`, the function will clean the header by modifying certain lines based on a specific
 2378        pattern. If `clean_header`, defaults to True
 2379        :type clean_header: bool (optional)
 2380        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2381        boolean flag that determines whether the #CHROM line should be removed from the header before
 2382        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2383        defaults to False
 2384        :type remove_chrom_line: bool (optional)
 2385        :return: The function `export_header` returns the name of the temporary header file that is
 2386        created.
 2387        """
 2388
 2389        if not header_name and not output_file:
 2390            output_file = self.get_output()
 2391
 2392        if self.get_header():
 2393
 2394            # Get header object
 2395            header_obj = self.get_header()
 2396
 2397            # Create database
 2398            db_for_header = Database(database=self.get_input())
 2399
 2400            # Get real columns in the file
 2401            db_header_columns = db_for_header.get_columns()
 2402
 2403            with tempfile.TemporaryDirectory() as tmpdir:
 2404
 2405                # Write header file
 2406                header_file_tmp = os.path.join(tmpdir, "header")
 2407                f = open(header_file_tmp, "w")
 2408                vcf.Writer(f, header_obj)
 2409                f.close()
 2410
 2411                # Replace #CHROM line with rel columns
 2412                header_list = db_for_header.read_header_file(
 2413                    header_file=header_file_tmp
 2414                )
 2415                header_list[-1] = "\t".join(db_header_columns)
 2416
 2417                # Remove CHROM line
 2418                if remove_chrom_line:
 2419                    header_list.pop()
 2420
 2421                # Clean header
 2422                if clean_header:
 2423                    header_list_clean = []
 2424                    for head in header_list:
 2425                        # Clean head for malformed header
 2426                        head_clean = head
 2427                        head_clean = re.subn(
 2428                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2429                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2430                            head_clean,
 2431                            2,
 2432                        )[0]
 2433                        # Write header
 2434                        header_list_clean.append(head_clean)
 2435                    header_list = header_list_clean
 2436
 2437            tmp_header_name = output_file + output_file_ext
 2438
 2439            f = open(tmp_header_name, "w")
 2440            for line in header_list:
 2441                f.write(line)
 2442            f.close()
 2443
 2444        return tmp_header_name
 2445
 2446    def export_variant_vcf(
 2447        self,
 2448        vcf_file,
 2449        remove_info: bool = False,
 2450        add_samples: bool = True,
 2451        list_samples: list = [],
 2452        where_clause: str = "",
 2453        index: bool = False,
 2454        threads: int | None = None,
 2455    ) -> bool | None:
 2456        """
 2457        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2458        remove INFO field, add samples, and control compression and indexing.
 2459
 2460        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2461        written to. It is the output file that will contain the filtered VCF data based on the specified
 2462        parameters
 2463        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2464        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2465        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2466        in, defaults to False
 2467        :type remove_info: bool (optional)
 2468        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2469        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2470        If set to False, the samples will be removed. The default value is True, defaults to True
 2471        :type add_samples: bool (optional)
 2472        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2473        in the output VCF file. By default, all samples will be included. If you provide a list of
 2474        samples, only those samples will be included in the output file
 2475        :type list_samples: list
 2476        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2477        determines whether or not to create an index for the output VCF file. If `index` is set to
 2478        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2479        :type index: bool (optional)
 2480        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2481        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2482        will be used during the export process. More threads can potentially speed up the export process
 2483        by utilizing multiple cores of the processor. If
 2484        :type threads: int | None
 2485        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2486        method with various parameters including the output file, query, threads, sort flag, and index
 2487        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2488        specified parameters and configurations provided in the `export_variant_vcf` function.
 2489        """
 2490
 2491        # Config
 2492        config = self.get_config()
 2493
 2494        # Extract VCF
 2495        log.debug("Export VCF...")
 2496
 2497        # Table variants
 2498        table_variants = self.get_table_variants()
 2499
 2500        # Threads
 2501        if not threads:
 2502            threads = self.get_threads()
 2503
 2504        # Info fields
 2505        if remove_info:
 2506            if not isinstance(remove_info, str):
 2507                remove_info = "."
 2508            info_field = f"""'{remove_info}' as INFO"""
 2509        else:
 2510            info_field = "INFO"
 2511
 2512        # Samples fields
 2513        if add_samples:
 2514            if not list_samples:
 2515                list_samples = self.get_header_sample_list()
 2516            if list_samples:
 2517                samples_fields = " , FORMAT , " + " , ".join(
 2518                    [f""" "{sample}" """ for sample in list_samples]
 2519                )
 2520            else:
 2521                samples_fields = ""
 2522            log.debug(f"samples_fields: {samples_fields}")
 2523        else:
 2524            samples_fields = ""
 2525
 2526        # Where clause
 2527        if where_clause is None:
 2528            where_clause = ""
 2529
 2530        # Variants
 2531        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2532        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2533        log.debug(f"sql_query_select={sql_query_select}")
 2534
 2535        return self.export_output(
 2536            output_file=vcf_file,
 2537            output_header=None,
 2538            export_header=True,
 2539            query=sql_query_select,
 2540            parquet_partitions=None,
 2541            chunk_size=config.get("chunk_size", None),
 2542            threads=threads,
 2543            sort=True,
 2544            index=index,
 2545            order_by=None,
 2546        )
 2547
 2548    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2549        """
 2550        It takes a list of commands and runs them in parallel using the number of threads specified
 2551
 2552        :param commands: A list of commands to run
 2553        :param threads: The number of threads to use, defaults to 1 (optional)
 2554        """
 2555
 2556        run_parallel_commands(commands, threads)
 2557
 2558    def get_threads(self, default: int = 1) -> int:
 2559        """
 2560        This function returns the number of threads to use for a job, with a default value of 1 if not
 2561        specified.
 2562
 2563        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2564        default number of threads to use if no specific value is provided. If no value is provided for
 2565        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2566        used, defaults to 1
 2567        :type default: int (optional)
 2568        :return: the number of threads to use for the current job.
 2569        """
 2570
 2571        # Config
 2572        config = self.get_config()
 2573
 2574        # Param
 2575        param = self.get_param()
 2576
 2577        # Input threads
 2578        input_thread = param.get("threads", config.get("threads", None))
 2579
 2580        # Check threads
 2581        if not input_thread:
 2582            threads = default
 2583        elif int(input_thread) <= 0:
 2584            threads = os.cpu_count()
 2585        else:
 2586            threads = int(input_thread)
 2587        return threads
 2588
 2589    def get_memory(self, default: str = None) -> str:
 2590        """
 2591        This function retrieves the memory value from parameters or configuration with a default value
 2592        if not found.
 2593
 2594        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2595        default value is used as a fallback in case the `memory` parameter is not provided in the
 2596        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2597        the function
 2598        :type default: str
 2599        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2600        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2601        return the default value provided as an argument to the function.
 2602        """
 2603
 2604        # Config
 2605        config = self.get_config()
 2606
 2607        # Param
 2608        param = self.get_param()
 2609
 2610        # Input threads
 2611        input_memory = param.get("memory", config.get("memory", None))
 2612
 2613        # Check threads
 2614        if input_memory:
 2615            memory = input_memory
 2616        else:
 2617            memory = default
 2618
 2619        return memory
 2620
 2621    def update_from_vcf(self, vcf_file: str) -> None:
 2622        """
 2623        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2624
 2625        :param vcf_file: the path to the VCF file
 2626        """
 2627
 2628        connexion_format = self.get_connexion_format()
 2629
 2630        if connexion_format in ["duckdb"]:
 2631            self.update_from_vcf_duckdb(vcf_file)
 2632        elif connexion_format in ["sqlite"]:
 2633            self.update_from_vcf_sqlite(vcf_file)
 2634
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the variants
        table (DuckDB connexion), matching variants on #CHROM/POS/REF/ALT.

        The VCF body is loaded into a local pandas DataFrame named `vcf_df`;
        the SQL below queries `vcf_df` by name, which presumably relies on
        DuckDB resolving local DataFrames as tables (replacement scan) — do not
        rename this variable.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Concatenate the incoming INFO onto the existing INFO, adding a ';'
        # separator only when both sides are non-empty ('' and '.' count as
        # empty). NOTE(review): when a variant has no match in vcf_df the
        # correlated subquery yields NULL — the resulting INFO then depends on
        # DuckDB's concat() NULL handling; confirm this leaves INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2690
 2691    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2692        """
 2693        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2694        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2695        table
 2696
 2697        :param vcf_file: The path to the VCF file you want to update the database with
 2698        """
 2699
 2700        # Create a temporary table for the VCF
 2701        table_vcf = "tmp_vcf"
 2702        sql_create = (
 2703            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2704        )
 2705        self.conn.execute(sql_create)
 2706
 2707        # Loading VCF into temporaire table
 2708        vcf_df = pd.read_csv(
 2709            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2710        )
 2711        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2712        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2713
 2714        # Update table 'variants' with VCF data
 2715        # warning: CONCAT as || operator
 2716        sql_query_update = f"""
 2717            UPDATE variants as table_variants
 2718            SET INFO = CASE
 2719                            WHEN INFO NOT IN ('', '.')
 2720                            THEN INFO
 2721                            ELSE ''
 2722                        END ||
 2723                        (
 2724                        SELECT 
 2725                            CASE 
 2726                                WHEN table_variants.INFO NOT IN ('','.') 
 2727                                    AND table_vcf.INFO NOT IN ('','.')  
 2728                                THEN ';' 
 2729                                ELSE '' 
 2730                            END || 
 2731                            CASE 
 2732                                WHEN table_vcf.INFO NOT IN ('','.') 
 2733                                THEN table_vcf.INFO 
 2734                                ELSE '' 
 2735                            END
 2736                        FROM {table_vcf} as table_vcf
 2737                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2738                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2739                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2740                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2741                        )
 2742        """
 2743        self.conn.execute(sql_query_update)
 2744
 2745        # Drop temporary table
 2746        sql_drop = f"DROP TABLE {table_vcf}"
 2747        self.conn.execute(sql_drop)
 2748
 2749    def drop_variants_table(self) -> None:
 2750        """
 2751        > This function drops the variants table
 2752        """
 2753
 2754        table_variants = self.get_table_variants()
 2755        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2756        self.conn.execute(sql_table_variants)
 2757
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table, populated with a hash
        of the assembly, `#CHROM`, `POS`, `REF`, `ALT` and an SVTYPE tag.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is (re)computed even if it already
        exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (parameter first, then configuration, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # Variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when an empty name is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create the variant_id column if missing, or when forced.
        # NOTE(review): the existence check uses the literal "variant_id", not
        # variant_id_column — with a custom column name the column is always
        # re-created; confirm this is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Populate the column with the hash.
            # NOTE(review): '"{prefix}SVTYPE"' expands to a single-quoted SQL
            # string literal, so the hash input is the constant column name
            # text rather than the column's value — confirm this is intended.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Return variant_id column name
        return variant_id_column
 2816
 2817    def get_variant_id_column(
 2818        self, variant_id_column: str = "variant_id", force: bool = None
 2819    ) -> str:
 2820        """
 2821        This function returns the variant_id column name
 2822
 2823        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2824        defaults to variant_id
 2825        :type variant_id_column: str (optional)
 2826        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2827        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2828        if it is not already set, or if it is set
 2829        :type force: bool
 2830        :return: The variant_id column name.
 2831        """
 2832
 2833        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2834
 2835    ###
 2836    # Annotation
 2837    ###
 2838
 2839    def scan_databases(
 2840        self,
 2841        database_formats: list = ["parquet"],
 2842        database_releases: list = ["current"],
 2843    ) -> dict:
 2844        """
 2845        The function `scan_databases` scans for available databases based on specified formats and
 2846        releases.
 2847
 2848        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2849        of the databases to be scanned. In this case, the accepted format is "parquet"
 2850        :type database_formats: list ["parquet"]
 2851        :param database_releases: The `database_releases` parameter is a list that specifies the
 2852        releases of the databases to be scanned. In the provided function, the default value for
 2853        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2854        databases that are in the "current"
 2855        :type database_releases: list
 2856        :return: The function `scan_databases` returns a dictionary containing information about
 2857        databases that match the specified formats and releases.
 2858        """
 2859
 2860        # Config
 2861        config = self.get_config()
 2862
 2863        # Param
 2864        param = self.get_param()
 2865
 2866        # Param - Assembly
 2867        assembly = param.get("assembly", config.get("assembly", None))
 2868        if not assembly:
 2869            assembly = DEFAULT_ASSEMBLY
 2870            log.warning(f"Default assembly '{assembly}'")
 2871
 2872        # Scan for availabled databases
 2873        log.info(
 2874            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2875        )
 2876        databases_infos_dict = databases_infos(
 2877            database_folder_releases=database_releases,
 2878            database_formats=database_formats,
 2879            assembly=assembly,
 2880            config=config,
 2881        )
 2882        log.info(
 2883            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2884        )
 2885
 2886        return databases_infos_dict
 2887
 2888    def annotation(self) -> None:
 2889        """
 2890        It annotates the VCF file with the annotations specified in the config file.
 2891        """
 2892
 2893        # Config
 2894        config = self.get_config()
 2895
 2896        # Param
 2897        param = self.get_param()
 2898
 2899        # Param - Assembly
 2900        assembly = param.get("assembly", config.get("assembly", None))
 2901        if not assembly:
 2902            assembly = DEFAULT_ASSEMBLY
 2903            log.warning(f"Default assembly '{assembly}'")
 2904
 2905        # annotations databases folders
 2906        annotations_databases = set(
 2907            config.get("folders", {})
 2908            .get("databases", {})
 2909            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2910            + config.get("folders", {})
 2911            .get("databases", {})
 2912            .get("parquet", ["~/howard/databases/parquet/current"])
 2913            + config.get("folders", {})
 2914            .get("databases", {})
 2915            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2916        )
 2917
 2918        # Get param annotations
 2919        if param.get("annotations", None) and isinstance(
 2920            param.get("annotations", None), str
 2921        ):
 2922            log.debug(param.get("annotations", None))
 2923            param_annotation_list = param.get("annotations").split(",")
 2924        else:
 2925            param_annotation_list = []
 2926
 2927        # Each tools param
 2928        if param.get("annotation_parquet", None) != None:
 2929            log.debug(
 2930                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2931            )
 2932            if isinstance(param.get("annotation_parquet", None), list):
 2933                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2934            else:
 2935                param_annotation_list.append(param.get("annotation_parquet"))
 2936        if param.get("annotation_snpsift", None) != None:
 2937            if isinstance(param.get("annotation_snpsift", None), list):
 2938                param_annotation_list.append(
 2939                    "snpsift:"
 2940                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2941                )
 2942            else:
 2943                param_annotation_list.append(
 2944                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2945                )
 2946        if param.get("annotation_snpeff", None) != None:
 2947            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2948        if param.get("annotation_bcftools", None) != None:
 2949            if isinstance(param.get("annotation_bcftools", None), list):
 2950                param_annotation_list.append(
 2951                    "bcftools:"
 2952                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2953                )
 2954            else:
 2955                param_annotation_list.append(
 2956                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2957                )
 2958        if param.get("annotation_annovar", None) != None:
 2959            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2960        if param.get("annotation_exomiser", None) != None:
 2961            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2962        if param.get("annotation_splice", None) != None:
 2963            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2964
 2965        # Merge param annotations list
 2966        param["annotations"] = ",".join(param_annotation_list)
 2967
 2968        # debug
 2969        log.debug(f"param_annotations={param['annotations']}")
 2970
 2971        if param.get("annotations"):
 2972
 2973            # Log
 2974            # log.info("Annotations - Check annotation parameters")
 2975
 2976            if not "annotation" in param:
 2977                param["annotation"] = {}
 2978
 2979            # List of annotations parameters
 2980            annotations_list_input = {}
 2981            if isinstance(param.get("annotations", None), str):
 2982                annotation_file_list = [
 2983                    value for value in param.get("annotations", "").split(",")
 2984                ]
 2985                for annotation_file in annotation_file_list:
 2986                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2987            else:
 2988                annotations_list_input = param.get("annotations", {})
 2989
 2990            log.info(f"Quick Annotations:")
 2991            for annotation_key in list(annotations_list_input.keys()):
 2992                log.info(f"   {annotation_key}")
 2993
 2994            # List of annotations and associated fields
 2995            annotations_list = {}
 2996
 2997            for annotation_file in annotations_list_input:
 2998
 2999                # Explode annotations if ALL
 3000                if (
 3001                    annotation_file.upper() == "ALL"
 3002                    or annotation_file.upper().startswith("ALL:")
 3003                ):
 3004
 3005                    # check ALL parameters (formats, releases)
 3006                    annotation_file_split = annotation_file.split(":")
 3007                    database_formats = "parquet"
 3008                    database_releases = "current"
 3009                    for annotation_file_option in annotation_file_split[1:]:
 3010                        database_all_options_split = annotation_file_option.split("=")
 3011                        if database_all_options_split[0] == "format":
 3012                            database_formats = database_all_options_split[1].split("+")
 3013                        if database_all_options_split[0] == "release":
 3014                            database_releases = database_all_options_split[1].split("+")
 3015
 3016                    # Scan for availabled databases
 3017                    databases_infos_dict = self.scan_databases(
 3018                        database_formats=database_formats,
 3019                        database_releases=database_releases,
 3020                    )
 3021
 3022                    # Add found databases in annotation parameters
 3023                    for database_infos in databases_infos_dict.keys():
 3024                        annotations_list[database_infos] = {"INFO": None}
 3025
 3026                else:
 3027                    annotations_list[annotation_file] = annotations_list_input[
 3028                        annotation_file
 3029                    ]
 3030
 3031            # Check each databases
 3032            if len(annotations_list):
 3033
 3034                log.info(
 3035                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3036                )
 3037
 3038                for annotation_file in annotations_list:
 3039
 3040                    # Init
 3041                    annotations = annotations_list.get(annotation_file, None)
 3042
 3043                    # Annotation snpEff
 3044                    if annotation_file.startswith("snpeff"):
 3045
 3046                        log.debug(f"Quick Annotation snpEff")
 3047
 3048                        if "snpeff" not in param["annotation"]:
 3049                            param["annotation"]["snpeff"] = {}
 3050
 3051                        if "options" not in param["annotation"]["snpeff"]:
 3052                            param["annotation"]["snpeff"]["options"] = ""
 3053
 3054                        # snpEff options in annotations
 3055                        param["annotation"]["snpeff"]["options"] = "".join(
 3056                            annotation_file.split(":")[1:]
 3057                        )
 3058
 3059                    # Annotation Annovar
 3060                    elif annotation_file.startswith("annovar"):
 3061
 3062                        log.debug(f"Quick Annotation Annovar")
 3063
 3064                        if "annovar" not in param["annotation"]:
 3065                            param["annotation"]["annovar"] = {}
 3066
 3067                        if "annotations" not in param["annotation"]["annovar"]:
 3068                            param["annotation"]["annovar"]["annotations"] = {}
 3069
 3070                        # Options
 3071                        annotation_file_split = annotation_file.split(":")
 3072                        for annotation_file_annotation in annotation_file_split[1:]:
 3073                            if annotation_file_annotation:
 3074                                param["annotation"]["annovar"]["annotations"][
 3075                                    annotation_file_annotation
 3076                                ] = annotations
 3077
 3078                    # Annotation Exomiser
 3079                    elif annotation_file.startswith("exomiser"):
 3080
 3081                        log.debug(f"Quick Annotation Exomiser")
 3082
 3083                        param["annotation"]["exomiser"] = params_string_to_dict(
 3084                            annotation_file
 3085                        )
 3086
 3087                    # Annotation Splice
 3088                    elif annotation_file.startswith("splice"):
 3089
 3090                        log.debug(f"Quick Annotation Splice")
 3091
 3092                        param["annotation"]["splice"] = params_string_to_dict(
 3093                            annotation_file
 3094                        )
 3095
 3096                    # Annotation Parquet or BCFTOOLS
 3097                    else:
 3098
 3099                        # Tools detection
 3100                        if annotation_file.startswith("bcftools:"):
 3101                            annotation_tool_initial = "bcftools"
 3102                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3103                        elif annotation_file.startswith("snpsift:"):
 3104                            annotation_tool_initial = "snpsift"
 3105                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3106                        elif annotation_file.startswith("bigwig:"):
 3107                            annotation_tool_initial = "bigwig"
 3108                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3109                        else:
 3110                            annotation_tool_initial = None
 3111
 3112                        # list of files
 3113                        annotation_file_list = annotation_file.replace("+", ":").split(
 3114                            ":"
 3115                        )
 3116
 3117                        for annotation_file in annotation_file_list:
 3118
 3119                            if annotation_file:
 3120
 3121                                # Annotation tool initial
 3122                                annotation_tool = annotation_tool_initial
 3123
 3124                                # Find file
 3125                                annotation_file_found = None
 3126
 3127                                if os.path.exists(annotation_file):
 3128                                    annotation_file_found = annotation_file
 3129                                elif os.path.exists(full_path(annotation_file)):
 3130                                    annotation_file_found = full_path(annotation_file)
 3131                                else:
 3132                                    # Find within assembly folders
 3133                                    for annotations_database in annotations_databases:
 3134                                        found_files = find_all(
 3135                                            annotation_file,
 3136                                            os.path.join(
 3137                                                annotations_database, assembly
 3138                                            ),
 3139                                        )
 3140                                        if len(found_files) > 0:
 3141                                            annotation_file_found = found_files[0]
 3142                                            break
 3143                                    if not annotation_file_found and not assembly:
 3144                                        # Find within folders
 3145                                        for (
 3146                                            annotations_database
 3147                                        ) in annotations_databases:
 3148                                            found_files = find_all(
 3149                                                annotation_file, annotations_database
 3150                                            )
 3151                                            if len(found_files) > 0:
 3152                                                annotation_file_found = found_files[0]
 3153                                                break
 3154                                log.debug(
 3155                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3156                                )
 3157
 3158                                # Full path
 3159                                annotation_file_found = full_path(annotation_file_found)
 3160
 3161                                if annotation_file_found:
 3162
 3163                                    database = Database(database=annotation_file_found)
 3164                                    quick_annotation_format = database.get_format()
 3165                                    quick_annotation_is_compressed = (
 3166                                        database.is_compressed()
 3167                                    )
 3168                                    quick_annotation_is_indexed = os.path.exists(
 3169                                        f"{annotation_file_found}.tbi"
 3170                                    )
 3171                                    bcftools_preference = False
 3172
 3173                                    # Check Annotation Tool
 3174                                    if not annotation_tool:
 3175                                        if (
 3176                                            bcftools_preference
 3177                                            and quick_annotation_format
 3178                                            in ["vcf", "bed"]
 3179                                            and quick_annotation_is_compressed
 3180                                            and quick_annotation_is_indexed
 3181                                        ):
 3182                                            annotation_tool = "bcftools"
 3183                                        elif quick_annotation_format in [
 3184                                            "vcf",
 3185                                            "bed",
 3186                                            "tsv",
 3187                                            "tsv",
 3188                                            "csv",
 3189                                            "json",
 3190                                            "tbl",
 3191                                            "parquet",
 3192                                            "duckdb",
 3193                                        ]:
 3194                                            annotation_tool = "parquet"
 3195                                        elif quick_annotation_format in ["bw"]:
 3196                                            annotation_tool = "bigwig"
 3197                                        else:
 3198                                            log.error(
 3199                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3200                                            )
 3201                                            raise ValueError(
 3202                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3203                                            )
 3204
 3205                                    log.debug(
 3206                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3207                                    )
 3208
 3209                                    # Annotation Tool dispatch
 3210                                    if annotation_tool:
 3211                                        if annotation_tool not in param["annotation"]:
 3212                                            param["annotation"][annotation_tool] = {}
 3213                                        if (
 3214                                            "annotations"
 3215                                            not in param["annotation"][annotation_tool]
 3216                                        ):
 3217                                            param["annotation"][annotation_tool][
 3218                                                "annotations"
 3219                                            ] = {}
 3220                                        param["annotation"][annotation_tool][
 3221                                            "annotations"
 3222                                        ][annotation_file_found] = annotations
 3223
 3224                                else:
 3225                                    log.warning(
 3226                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3227                                    )
 3228
 3229                self.set_param(param)
 3230
 3231        if param.get("annotation", None):
 3232            log.info("Annotations")
 3233            if param.get("annotation", {}).get("parquet", None):
 3234                log.info("Annotations 'parquet'...")
 3235                self.annotation_parquet()
 3236            if param.get("annotation", {}).get("bcftools", None):
 3237                log.info("Annotations 'bcftools'...")
 3238                self.annotation_bcftools()
 3239            if param.get("annotation", {}).get("snpsift", None):
 3240                log.info("Annotations 'snpsift'...")
 3241                self.annotation_snpsift()
 3242            if param.get("annotation", {}).get("bigwig", None):
 3243                log.info("Annotations 'bigwig'...")
 3244                self.annotation_bigwig()
 3245            if param.get("annotation", {}).get("annovar", None):
 3246                log.info("Annotations 'annovar'...")
 3247                self.annotation_annovar()
 3248            if param.get("annotation", {}).get("snpeff", None):
 3249                log.info("Annotations 'snpeff'...")
 3250                self.annotation_snpeff()
 3251            if param.get("annotation", {}).get("exomiser", None) is not None:
 3252                log.info("Annotations 'exomiser'...")
 3253                self.annotation_exomiser()
 3254            if param.get("annotation", {}).get("splice", None) is not None:
 3255                log.info("Annotations 'splice' ...")
 3256                self.annotation_splice()
 3257
 3258        # Explode INFOS fields into table fields
 3259        if self.get_explode_infos():
 3260            self.explode_infos(
 3261                prefix=self.get_explode_infos_prefix(),
 3262                fields=self.get_explode_infos_fields(),
 3263                force=True,
 3264            )
 3265
 3266    def annotation_bigwig(self, threads: int = None) -> None:
 3267        """
 3268        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3269
 3270        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3271        number of threads to be used for parallel processing during the annotation process. If the
 3272        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3273        threads to use based on the system configuration
 3274        :type threads: int
 3275        :return: True
 3276        """
 3277
 3278        # DEBUG
 3279        log.debug("Start annotation with bigwig databases")
 3280
 3281        # # Threads
 3282        # if not threads:
 3283        #     threads = self.get_threads()
 3284        # log.debug("Threads: " + str(threads))
 3285
 3286        # Config
 3287        config = self.get_config()
 3288        log.debug("Config: " + str(config))
 3289
 3290        # Config - BCFTools databases folders
 3291        databases_folders = set(
 3292            self.get_config()
 3293            .get("folders", {})
 3294            .get("databases", {})
 3295            .get("annotations", ["."])
 3296            + self.get_config()
 3297            .get("folders", {})
 3298            .get("databases", {})
 3299            .get("bigwig", ["."])
 3300        )
 3301        log.debug("Databases annotations: " + str(databases_folders))
 3302
 3303        # Param
 3304        annotations = (
 3305            self.get_param()
 3306            .get("annotation", {})
 3307            .get("bigwig", {})
 3308            .get("annotations", None)
 3309        )
 3310        log.debug("Annotations: " + str(annotations))
 3311
 3312        # Assembly
 3313        assembly = self.get_param().get(
 3314            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3315        )
 3316
 3317        # Data
 3318        table_variants = self.get_table_variants()
 3319
 3320        # Check if not empty
 3321        log.debug("Check if not empty")
 3322        sql_query_chromosomes = (
 3323            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3324        )
 3325        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3326        if not sql_query_chromosomes_df["count"][0]:
 3327            log.info(f"VCF empty")
 3328            return
 3329
 3330        # VCF header
 3331        vcf_reader = self.get_header()
 3332        log.debug("Initial header: " + str(vcf_reader.infos))
 3333
 3334        # Existing annotations
 3335        for vcf_annotation in self.get_header().infos:
 3336
 3337            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3338            log.debug(
 3339                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3340            )
 3341
 3342        if annotations:
 3343
 3344            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3345
 3346                # Export VCF file
 3347                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3348
 3349                # annotation_bigwig_config
 3350                annotation_bigwig_config_list = []
 3351
 3352                for annotation in annotations:
 3353                    annotation_fields = annotations[annotation]
 3354
 3355                    # Annotation Name
 3356                    annotation_name = os.path.basename(annotation)
 3357
 3358                    if not annotation_fields:
 3359                        annotation_fields = {"INFO": None}
 3360
 3361                    log.debug(f"Annotation '{annotation_name}'")
 3362                    log.debug(
 3363                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3364                    )
 3365
 3366                    # Create Database
 3367                    database = Database(
 3368                        database=annotation,
 3369                        databases_folders=databases_folders,
 3370                        assembly=assembly,
 3371                    )
 3372
 3373                    # Find files
 3374                    db_file = database.get_database()
 3375                    db_file = full_path(db_file)
 3376                    db_hdr_file = database.get_header_file()
 3377                    db_hdr_file = full_path(db_hdr_file)
 3378                    db_file_type = database.get_format()
 3379
 3380                    # If db_file is http ?
 3381                    if database.get_database().startswith("http"):
 3382
 3383                        # Datbase is HTTP URL
 3384                        db_file_is_http = True
 3385
 3386                        # DB file keep as URL
 3387                        db_file = database.get_database()
 3388                        log.warning(
 3389                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3390                        )
 3391
 3392                        # Retrieve automatic annotation field name
 3393                        annotation_field = clean_annotation_field(
 3394                            os.path.basename(db_file).replace(".bw", "")
 3395                        )
 3396                        log.debug(
 3397                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3398                        )
 3399
 3400                        # Create automatic header file
 3401                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3402                        with open(db_hdr_file, "w") as f:
 3403                            f.write("##fileformat=VCFv4.2\n")
 3404                            f.write(
 3405                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3406                            )
 3407                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3408
 3409                    else:
 3410
 3411                        # Datbase is NOT HTTP URL
 3412                        db_file_is_http = False
 3413
 3414                    # Check index - try to create if not exists
 3415                    if (
 3416                        db_file is None
 3417                        or db_hdr_file is None
 3418                        or (not os.path.exists(db_file) and not db_file_is_http)
 3419                        or not os.path.exists(db_hdr_file)
 3420                        or not db_file_type in ["bw"]
 3421                    ):
 3422                        # if False:
 3423                        log.error("Annotation failed: database not valid")
 3424                        log.error(f"Annotation annotation file: {db_file}")
 3425                        log.error(f"Annotation annotation file type: {db_file_type}")
 3426                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3427                        raise ValueError(
 3428                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3429                        )
 3430                    else:
 3431
 3432                        # Log
 3433                        log.debug(
 3434                            f"Annotation '{annotation}' - file: "
 3435                            + str(db_file)
 3436                            + " and "
 3437                            + str(db_hdr_file)
 3438                        )
 3439
 3440                        # Load header as VCF object
 3441                        db_hdr_vcf = Variants(input=db_hdr_file)
 3442                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3443                        log.debug(
 3444                            "Annotation database header: "
 3445                            + str(db_hdr_vcf_header_infos)
 3446                        )
 3447
 3448                        # For all fields in database
 3449                        annotation_fields_full = False
 3450                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3451                            annotation_fields = {
 3452                                key: key for key in db_hdr_vcf_header_infos
 3453                            }
 3454                            log.debug(
 3455                                "Annotation database header - All annotations added: "
 3456                                + str(annotation_fields)
 3457                            )
 3458                            annotation_fields_full = True
 3459
 3460                        # Init
 3461                        cyvcf2_header_rename_dict = {}
 3462                        cyvcf2_header_list = []
 3463                        cyvcf2_header_indexes = {}
 3464
 3465                        # process annotation fields
 3466                        for annotation_field in annotation_fields:
 3467
 3468                            # New annotation name
 3469                            annotation_field_new = annotation_fields[annotation_field]
 3470
 3471                            # Check annotation field and index in header
 3472                            if (
 3473                                annotation_field
 3474                                in db_hdr_vcf.get_header_columns_as_list()
 3475                            ):
 3476                                annotation_field_index = (
 3477                                    db_hdr_vcf.get_header_columns_as_list().index(
 3478                                        annotation_field
 3479                                    )
 3480                                    - 3
 3481                                )
 3482                                cyvcf2_header_indexes[annotation_field_new] = (
 3483                                    annotation_field_index
 3484                                )
 3485                            else:
 3486                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3487                                log.error(msg_err)
 3488                                raise ValueError(msg_err)
 3489
 3490                            # Append annotation field in cyvcf2 header list
 3491                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3492                                db_hdr_vcf_header_infos[annotation_field].id
 3493                            )
 3494                            cyvcf2_header_list.append(
 3495                                {
 3496                                    "ID": annotation_field_new,
 3497                                    "Number": db_hdr_vcf_header_infos[
 3498                                        annotation_field
 3499                                    ].num,
 3500                                    "Type": db_hdr_vcf_header_infos[
 3501                                        annotation_field
 3502                                    ].type,
 3503                                    "Description": db_hdr_vcf_header_infos[
 3504                                        annotation_field
 3505                                    ].desc,
 3506                                }
 3507                            )
 3508
 3509                            # Add header on VCF
 3510                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3511                                annotation_field_new,
 3512                                db_hdr_vcf_header_infos[annotation_field].num,
 3513                                db_hdr_vcf_header_infos[annotation_field].type,
 3514                                db_hdr_vcf_header_infos[annotation_field].desc,
 3515                                "HOWARD BigWig annotation",
 3516                                "unknown",
 3517                                self.code_type_map[
 3518                                    db_hdr_vcf_header_infos[annotation_field].type
 3519                                ],
 3520                            )
 3521
 3522                        # Load bigwig database
 3523                        bw_db = pyBigWig.open(db_file)
 3524                        if bw_db.isBigWig():
 3525                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3526                        else:
 3527                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3528                            log.error(msg_err)
 3529                            raise ValueError(msg_err)
 3530
 3531                        annotation_bigwig_config_list.append(
 3532                            {
 3533                                "db_file": db_file,
 3534                                "bw_db": bw_db,
 3535                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3536                                "cyvcf2_header_list": cyvcf2_header_list,
 3537                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3538                            }
 3539                        )
 3540
 3541                # Annotate
 3542                if annotation_bigwig_config_list:
 3543
 3544                    # Annotation config
 3545                    log.debug(
 3546                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3547                    )
 3548
 3549                    # Export VCF file
 3550                    self.export_variant_vcf(
 3551                        vcf_file=tmp_vcf_name,
 3552                        remove_info=True,
 3553                        add_samples=False,
 3554                        index=True,
 3555                    )
 3556
 3557                    # Load input tmp file
 3558                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3559
 3560                    # Add header in input file
 3561                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3562                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3563                            "cyvcf2_header_list", []
 3564                        ):
 3565                            log.info(
 3566                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3567                            )
 3568                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3569
 3570                    # Create output VCF file
 3571                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3572                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3573
 3574                    # Fetch variants
 3575                    log.info(f"Annotations 'bigwig' start...")
 3576                    for variant in input_vcf:
 3577
 3578                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3579
 3580                            # DB and indexes
 3581                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3582                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3583                                "cyvcf2_header_indexes", None
 3584                            )
 3585
 3586                            # Retrieve value from chrom pos
 3587                            res = bw_db.values(
 3588                                variant.CHROM, variant.POS - 1, variant.POS
 3589                            )
 3590
 3591                            # For each annotation fields (and indexes)
 3592                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3593
 3594                                # If value is NOT nNone
 3595                                if not np.isnan(
 3596                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3597                                ):
 3598                                    variant.INFO[cyvcf2_header_index] = res[
 3599                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3600                                    ]
 3601
 3602                        # Add record in output file
 3603                        output_vcf.write_record(variant)
 3604
 3605                    # Log
 3606                    log.debug(f"Annotation done.")
 3607
 3608                    # Close and write file
 3609                    log.info(f"Annotations 'bigwig' write...")
 3610                    output_vcf.close()
 3611                    log.debug(f"Write done.")
 3612
 3613                    # Update variants
 3614                    log.info(f"Annotations 'bigwig' update...")
 3615                    self.update_from_vcf(output_vcf_file)
 3616                    log.debug(f"Update done.")
 3617
 3618        return True
 3619
 3620    def annotation_snpsift(self, threads: int = None) -> None:
 3621        """
 3622        This function annotate with bcftools
 3623
 3624        :param threads: Number of threads to use
 3625        :return: the value of the variable "return_value".
 3626        """
 3627
 3628        # DEBUG
 3629        log.debug("Start annotation with bcftools databases")
 3630
 3631        # Threads
 3632        if not threads:
 3633            threads = self.get_threads()
 3634        log.debug("Threads: " + str(threads))
 3635
 3636        # Config
 3637        config = self.get_config()
 3638        log.debug("Config: " + str(config))
 3639
 3640        # Config - snpSift
 3641        snpsift_bin_command = get_bin_command(
 3642            bin="SnpSift.jar",
 3643            tool="snpsift",
 3644            bin_type="jar",
 3645            config=config,
 3646            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3647        )
 3648        if not snpsift_bin_command:
 3649            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3650            log.error(msg_err)
 3651            raise ValueError(msg_err)
 3652
 3653        # Config - bcftools
 3654        bcftools_bin_command = get_bin_command(
 3655            bin="bcftools",
 3656            tool="bcftools",
 3657            bin_type="bin",
 3658            config=config,
 3659            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3660        )
 3661        if not bcftools_bin_command:
 3662            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3663            log.error(msg_err)
 3664            raise ValueError(msg_err)
 3665
 3666        # Config - BCFTools databases folders
 3667        databases_folders = set(
 3668            self.get_config()
 3669            .get("folders", {})
 3670            .get("databases", {})
 3671            .get("annotations", ["."])
 3672            + self.get_config()
 3673            .get("folders", {})
 3674            .get("databases", {})
 3675            .get("bcftools", ["."])
 3676        )
 3677        log.debug("Databases annotations: " + str(databases_folders))
 3678
 3679        # Param
 3680        annotations = (
 3681            self.get_param()
 3682            .get("annotation", {})
 3683            .get("snpsift", {})
 3684            .get("annotations", None)
 3685        )
 3686        log.debug("Annotations: " + str(annotations))
 3687
 3688        # Assembly
 3689        assembly = self.get_param().get(
 3690            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3691        )
 3692
 3693        # Data
 3694        table_variants = self.get_table_variants()
 3695
 3696        # Check if not empty
 3697        log.debug("Check if not empty")
 3698        sql_query_chromosomes = (
 3699            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3700        )
 3701        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3702        if not sql_query_chromosomes_df["count"][0]:
 3703            log.info(f"VCF empty")
 3704            return
 3705
 3706        # VCF header
 3707        vcf_reader = self.get_header()
 3708        log.debug("Initial header: " + str(vcf_reader.infos))
 3709
 3710        # Existing annotations
 3711        for vcf_annotation in self.get_header().infos:
 3712
 3713            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3714            log.debug(
 3715                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3716            )
 3717
 3718        if annotations:
 3719
 3720            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3721
 3722                # Export VCF file
 3723                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3724
 3725                # Init
 3726                commands = {}
 3727
 3728                for annotation in annotations:
 3729                    annotation_fields = annotations[annotation]
 3730
 3731                    # Annotation Name
 3732                    annotation_name = os.path.basename(annotation)
 3733
 3734                    if not annotation_fields:
 3735                        annotation_fields = {"INFO": None}
 3736
 3737                    log.debug(f"Annotation '{annotation_name}'")
 3738                    log.debug(
 3739                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3740                    )
 3741
 3742                    # Create Database
 3743                    database = Database(
 3744                        database=annotation,
 3745                        databases_folders=databases_folders,
 3746                        assembly=assembly,
 3747                    )
 3748
 3749                    # Find files
 3750                    db_file = database.get_database()
 3751                    db_file = full_path(db_file)
 3752                    db_hdr_file = database.get_header_file()
 3753                    db_hdr_file = full_path(db_hdr_file)
 3754                    db_file_type = database.get_format()
 3755                    db_tbi_file = f"{db_file}.tbi"
 3756                    db_file_compressed = database.is_compressed()
 3757
 3758                    # Check if compressed
 3759                    if not db_file_compressed:
 3760                        log.error(
 3761                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3762                        )
 3763                        raise ValueError(
 3764                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3765                        )
 3766
 3767                    # Check if indexed
 3768                    if not os.path.exists(db_tbi_file):
 3769                        log.error(
 3770                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3771                        )
 3772                        raise ValueError(
 3773                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3774                        )
 3775
 3776                    # Check index - try to create if not exists
 3777                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3778                        log.error("Annotation failed: database not valid")
 3779                        log.error(f"Annotation annotation file: {db_file}")
 3780                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3781                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3782                        raise ValueError(
 3783                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3784                        )
 3785                    else:
 3786
 3787                        log.debug(
 3788                            f"Annotation '{annotation}' - file: "
 3789                            + str(db_file)
 3790                            + " and "
 3791                            + str(db_hdr_file)
 3792                        )
 3793
 3794                        # Load header as VCF object
 3795                        db_hdr_vcf = Variants(input=db_hdr_file)
 3796                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3797                        log.debug(
 3798                            "Annotation database header: "
 3799                            + str(db_hdr_vcf_header_infos)
 3800                        )
 3801
 3802                        # For all fields in database
 3803                        annotation_fields_full = False
 3804                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3805                            annotation_fields = {
 3806                                key: key for key in db_hdr_vcf_header_infos
 3807                            }
 3808                            log.debug(
 3809                                "Annotation database header - All annotations added: "
 3810                                + str(annotation_fields)
 3811                            )
 3812                            annotation_fields_full = True
 3813
 3814                        # # Create file for field rename
 3815                        # log.debug("Create file for field rename")
 3816                        # tmp_rename = NamedTemporaryFile(
 3817                        #     prefix=self.get_prefix(),
 3818                        #     dir=self.get_tmp_dir(),
 3819                        #     suffix=".rename",
 3820                        #     delete=False,
 3821                        # )
 3822                        # tmp_rename_name = tmp_rename.name
 3823                        # tmp_files.append(tmp_rename_name)
 3824
 3825                        # Number of fields
 3826                        nb_annotation_field = 0
 3827                        annotation_list = []
 3828                        annotation_infos_rename_list = []
 3829
 3830                        for annotation_field in annotation_fields:
 3831
 3832                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3833                            annotation_fields_new_name = annotation_fields.get(
 3834                                annotation_field, annotation_field
 3835                            )
 3836                            if not annotation_fields_new_name:
 3837                                annotation_fields_new_name = annotation_field
 3838
 3839                            # Check if field is in DB and if field is not elready in input data
 3840                            if (
 3841                                annotation_field in db_hdr_vcf.get_header().infos
 3842                                and annotation_fields_new_name
 3843                                not in self.get_header().infos
 3844                            ):
 3845
 3846                                log.info(
 3847                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3848                                )
 3849
 3850                                # BCFTools annotate param to rename fields
 3851                                if annotation_field != annotation_fields_new_name:
 3852                                    annotation_infos_rename_list.append(
 3853                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3854                                    )
 3855
 3856                                # Add INFO field to header
 3857                                db_hdr_vcf_header_infos_number = (
 3858                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3859                                )
 3860                                db_hdr_vcf_header_infos_type = (
 3861                                    db_hdr_vcf_header_infos[annotation_field].type
 3862                                    or "String"
 3863                                )
 3864                                db_hdr_vcf_header_infos_description = (
 3865                                    db_hdr_vcf_header_infos[annotation_field].desc
 3866                                    or f"{annotation_field} description"
 3867                                )
 3868                                db_hdr_vcf_header_infos_source = (
 3869                                    db_hdr_vcf_header_infos[annotation_field].source
 3870                                    or "unknown"
 3871                                )
 3872                                db_hdr_vcf_header_infos_version = (
 3873                                    db_hdr_vcf_header_infos[annotation_field].version
 3874                                    or "unknown"
 3875                                )
 3876
 3877                                vcf_reader.infos[annotation_fields_new_name] = (
 3878                                    vcf.parser._Info(
 3879                                        annotation_fields_new_name,
 3880                                        db_hdr_vcf_header_infos_number,
 3881                                        db_hdr_vcf_header_infos_type,
 3882                                        db_hdr_vcf_header_infos_description,
 3883                                        db_hdr_vcf_header_infos_source,
 3884                                        db_hdr_vcf_header_infos_version,
 3885                                        self.code_type_map[
 3886                                            db_hdr_vcf_header_infos_type
 3887                                        ],
 3888                                    )
 3889                                )
 3890
 3891                                annotation_list.append(annotation_field)
 3892
 3893                                nb_annotation_field += 1
 3894
 3895                            else:
 3896
 3897                                if (
 3898                                    annotation_field
 3899                                    not in db_hdr_vcf.get_header().infos
 3900                                ):
 3901                                    log.warning(
 3902                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3903                                    )
 3904                                if (
 3905                                    annotation_fields_new_name
 3906                                    in self.get_header().infos
 3907                                ):
 3908                                    log.warning(
 3909                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3910                                    )
 3911
 3912                        log.info(
 3913                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3914                        )
 3915
 3916                        annotation_infos = ",".join(annotation_list)
 3917
 3918                        if annotation_infos != "":
 3919
 3920                            # Annotated VCF (and error file)
 3921                            tmp_annotation_vcf_name = os.path.join(
 3922                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3923                            )
 3924                            tmp_annotation_vcf_name_err = (
 3925                                tmp_annotation_vcf_name + ".err"
 3926                            )
 3927
 3928                            # Add fields to annotate
 3929                            if not annotation_fields_full:
 3930                                annotation_infos_option = f"-info {annotation_infos}"
 3931                            else:
 3932                                annotation_infos_option = ""
 3933
 3934                            # Info fields rename
 3935                            if annotation_infos_rename_list:
 3936                                annotation_infos_rename = " -c " + ",".join(
 3937                                    annotation_infos_rename_list
 3938                                )
 3939                            else:
 3940                                annotation_infos_rename = ""
 3941
 3942                            # Annotate command
 3943                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3944
 3945                            # Add command
 3946                            commands[command_annotate] = tmp_annotation_vcf_name
 3947
 3948                if commands:
 3949
 3950                    # Export VCF file
 3951                    self.export_variant_vcf(
 3952                        vcf_file=tmp_vcf_name,
 3953                        remove_info=True,
 3954                        add_samples=False,
 3955                        index=True,
 3956                    )
 3957                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3958
 3959                    # Num command
 3960                    nb_command = 0
 3961
 3962                    # Annotate
 3963                    for command_annotate in commands:
 3964                        nb_command += 1
 3965                        log.info(
 3966                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3967                        )
 3968                        log.debug(f"command_annotate={command_annotate}")
 3969                        run_parallel_commands([command_annotate], threads)
 3970
 3971                        # Debug
 3972                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3973
 3974                        # Update variants
 3975                        log.info(
 3976                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3977                        )
 3978                        self.update_from_vcf(commands[command_annotate])
 3979
 3980    def annotation_bcftools(self, threads: int = None) -> None:
 3981        """
 3982        This function annotate with bcftools
 3983
 3984        :param threads: Number of threads to use
 3985        :return: the value of the variable "return_value".
 3986        """
 3987
 3988        # DEBUG
 3989        log.debug("Start annotation with bcftools databases")
 3990
 3991        # Threads
 3992        if not threads:
 3993            threads = self.get_threads()
 3994        log.debug("Threads: " + str(threads))
 3995
 3996        # Config
 3997        config = self.get_config()
 3998        log.debug("Config: " + str(config))
 3999
 4000        # DEBUG
 4001        delete_tmp = True
 4002        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4003            delete_tmp = False
 4004            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4005
 4006        # Config - BCFTools bin command
 4007        bcftools_bin_command = get_bin_command(
 4008            bin="bcftools",
 4009            tool="bcftools",
 4010            bin_type="bin",
 4011            config=config,
 4012            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4013        )
 4014        if not bcftools_bin_command:
 4015            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4016            log.error(msg_err)
 4017            raise ValueError(msg_err)
 4018
 4019        # Config - BCFTools databases folders
 4020        databases_folders = set(
 4021            self.get_config()
 4022            .get("folders", {})
 4023            .get("databases", {})
 4024            .get("annotations", ["."])
 4025            + self.get_config()
 4026            .get("folders", {})
 4027            .get("databases", {})
 4028            .get("bcftools", ["."])
 4029        )
 4030        log.debug("Databases annotations: " + str(databases_folders))
 4031
 4032        # Param
 4033        annotations = (
 4034            self.get_param()
 4035            .get("annotation", {})
 4036            .get("bcftools", {})
 4037            .get("annotations", None)
 4038        )
 4039        log.debug("Annotations: " + str(annotations))
 4040
 4041        # Assembly
 4042        assembly = self.get_param().get(
 4043            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4044        )
 4045
 4046        # Data
 4047        table_variants = self.get_table_variants()
 4048
 4049        # Check if not empty
 4050        log.debug("Check if not empty")
 4051        sql_query_chromosomes = (
 4052            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4053        )
 4054        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4055        if not sql_query_chromosomes_df["count"][0]:
 4056            log.info(f"VCF empty")
 4057            return
 4058
 4059        # Export in VCF
 4060        log.debug("Create initial file to annotate")
 4061        tmp_vcf = NamedTemporaryFile(
 4062            prefix=self.get_prefix(),
 4063            dir=self.get_tmp_dir(),
 4064            suffix=".vcf.gz",
 4065            delete=False,
 4066        )
 4067        tmp_vcf_name = tmp_vcf.name
 4068
 4069        # VCF header
 4070        vcf_reader = self.get_header()
 4071        log.debug("Initial header: " + str(vcf_reader.infos))
 4072
 4073        # Existing annotations
 4074        for vcf_annotation in self.get_header().infos:
 4075
 4076            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4077            log.debug(
 4078                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4079            )
 4080
 4081        if annotations:
 4082
 4083            tmp_ann_vcf_list = []
 4084            commands = []
 4085            tmp_files = []
 4086            err_files = []
 4087
 4088            for annotation in annotations:
 4089                annotation_fields = annotations[annotation]
 4090
 4091                # Annotation Name
 4092                annotation_name = os.path.basename(annotation)
 4093
 4094                if not annotation_fields:
 4095                    annotation_fields = {"INFO": None}
 4096
 4097                log.debug(f"Annotation '{annotation_name}'")
 4098                log.debug(
 4099                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4100                )
 4101
 4102                # Create Database
 4103                database = Database(
 4104                    database=annotation,
 4105                    databases_folders=databases_folders,
 4106                    assembly=assembly,
 4107                )
 4108
 4109                # Find files
 4110                db_file = database.get_database()
 4111                db_file = full_path(db_file)
 4112                db_hdr_file = database.get_header_file()
 4113                db_hdr_file = full_path(db_hdr_file)
 4114                db_file_type = database.get_format()
 4115                db_tbi_file = f"{db_file}.tbi"
 4116                db_file_compressed = database.is_compressed()
 4117
 4118                # Check if compressed
 4119                if not db_file_compressed:
 4120                    log.error(
 4121                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4122                    )
 4123                    raise ValueError(
 4124                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4125                    )
 4126
 4127                # Check if indexed
 4128                if not os.path.exists(db_tbi_file):
 4129                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4130                    raise ValueError(
 4131                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4132                    )
 4133
 4134                # Check index - try to create if not exists
 4135                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4136                    log.error("Annotation failed: database not valid")
 4137                    log.error(f"Annotation annotation file: {db_file}")
 4138                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4139                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4140                    raise ValueError(
 4141                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4142                    )
 4143                else:
 4144
 4145                    log.debug(
 4146                        f"Annotation '{annotation}' - file: "
 4147                        + str(db_file)
 4148                        + " and "
 4149                        + str(db_hdr_file)
 4150                    )
 4151
 4152                    # Load header as VCF object
 4153                    db_hdr_vcf = Variants(input=db_hdr_file)
 4154                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4155                    log.debug(
 4156                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4157                    )
 4158
 4159                    # For all fields in database
 4160                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4161                        annotation_fields = {
 4162                            key: key for key in db_hdr_vcf_header_infos
 4163                        }
 4164                        log.debug(
 4165                            "Annotation database header - All annotations added: "
 4166                            + str(annotation_fields)
 4167                        )
 4168
 4169                    # Number of fields
 4170                    nb_annotation_field = 0
 4171                    annotation_list = []
 4172
 4173                    for annotation_field in annotation_fields:
 4174
 4175                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4176                        annotation_fields_new_name = annotation_fields.get(
 4177                            annotation_field, annotation_field
 4178                        )
 4179                        if not annotation_fields_new_name:
 4180                            annotation_fields_new_name = annotation_field
 4181
 4182                        # Check if field is in DB and if field is not elready in input data
 4183                        if (
 4184                            annotation_field in db_hdr_vcf.get_header().infos
 4185                            and annotation_fields_new_name
 4186                            not in self.get_header().infos
 4187                        ):
 4188
 4189                            log.info(
 4190                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4191                            )
 4192
 4193                            # Add INFO field to header
 4194                            db_hdr_vcf_header_infos_number = (
 4195                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4196                            )
 4197                            db_hdr_vcf_header_infos_type = (
 4198                                db_hdr_vcf_header_infos[annotation_field].type
 4199                                or "String"
 4200                            )
 4201                            db_hdr_vcf_header_infos_description = (
 4202                                db_hdr_vcf_header_infos[annotation_field].desc
 4203                                or f"{annotation_field} description"
 4204                            )
 4205                            db_hdr_vcf_header_infos_source = (
 4206                                db_hdr_vcf_header_infos[annotation_field].source
 4207                                or "unknown"
 4208                            )
 4209                            db_hdr_vcf_header_infos_version = (
 4210                                db_hdr_vcf_header_infos[annotation_field].version
 4211                                or "unknown"
 4212                            )
 4213
 4214                            vcf_reader.infos[annotation_fields_new_name] = (
 4215                                vcf.parser._Info(
 4216                                    annotation_fields_new_name,
 4217                                    db_hdr_vcf_header_infos_number,
 4218                                    db_hdr_vcf_header_infos_type,
 4219                                    db_hdr_vcf_header_infos_description,
 4220                                    db_hdr_vcf_header_infos_source,
 4221                                    db_hdr_vcf_header_infos_version,
 4222                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4223                                )
 4224                            )
 4225
 4226                            # annotation_list.append(annotation_field)
 4227                            if annotation_field != annotation_fields_new_name:
 4228                                annotation_list.append(
 4229                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4230                                )
 4231                            else:
 4232                                annotation_list.append(annotation_field)
 4233
 4234                            nb_annotation_field += 1
 4235
 4236                        else:
 4237
 4238                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4239                                log.warning(
 4240                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4241                                )
 4242                            if annotation_fields_new_name in self.get_header().infos:
 4243                                log.warning(
 4244                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4245                                )
 4246
 4247                    log.info(
 4248                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4249                    )
 4250
 4251                    annotation_infos = ",".join(annotation_list)
 4252
 4253                    if annotation_infos != "":
 4254
 4255                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4256                        log.debug("Protect Header file - remove #CHROM line if exists")
 4257                        tmp_header_vcf = NamedTemporaryFile(
 4258                            prefix=self.get_prefix(),
 4259                            dir=self.get_tmp_dir(),
 4260                            suffix=".hdr",
 4261                            delete=False,
 4262                        )
 4263                        tmp_header_vcf_name = tmp_header_vcf.name
 4264                        tmp_files.append(tmp_header_vcf_name)
 4265                        # Command
 4266                        if db_hdr_file.endswith(".gz"):
 4267                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4268                        else:
 4269                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4270                        # Run
 4271                        run_parallel_commands([command_extract_header], 1)
 4272
                        # Find chromosomes
 4274                        log.debug("Find chromosomes ")
 4275                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4276                        sql_query_chromosomes_df = self.get_query_to_df(
 4277                            sql_query_chromosomes
 4278                        )
 4279                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4280
 4281                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4282
 4283                        # BED columns in the annotation file
 4284                        if db_file_type in ["bed"]:
 4285                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4286
 4287                        for chrom in chomosomes_list:
 4288
 4289                            # Create BED on initial VCF
 4290                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4291                            tmp_bed = NamedTemporaryFile(
 4292                                prefix=self.get_prefix(),
 4293                                dir=self.get_tmp_dir(),
 4294                                suffix=".bed",
 4295                                delete=False,
 4296                            )
 4297                            tmp_bed_name = tmp_bed.name
 4298                            tmp_files.append(tmp_bed_name)
 4299
                            # Detect regions
 4301                            log.debug(
 4302                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4303                            )
 4304                            window = 1000000
 4305                            sql_query_intervals_for_bed = f"""
 4306                                SELECT  \"#CHROM\",
 4307                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4308                                        \"POS\"+{window}
 4309                                FROM {table_variants} as table_variants
 4310                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4311                            """
 4312                            regions = self.conn.execute(
 4313                                sql_query_intervals_for_bed
 4314                            ).fetchall()
 4315                            merged_regions = merge_regions(regions)
 4316                            log.debug(
 4317                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4318                            )
 4319
 4320                            header = ["#CHROM", "START", "END"]
 4321                            with open(tmp_bed_name, "w") as f:
 4322                                # Write the header with tab delimiter
 4323                                f.write("\t".join(header) + "\n")
 4324                                for d in merged_regions:
 4325                                    # Write each data row with tab delimiter
 4326                                    f.write("\t".join(map(str, d)) + "\n")
 4327
 4328                            # Tmp files
 4329                            tmp_annotation_vcf = NamedTemporaryFile(
 4330                                prefix=self.get_prefix(),
 4331                                dir=self.get_tmp_dir(),
 4332                                suffix=".vcf.gz",
 4333                                delete=False,
 4334                            )
 4335                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4336                            tmp_files.append(tmp_annotation_vcf_name)
 4337                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4338                            tmp_annotation_vcf_name_err = (
 4339                                tmp_annotation_vcf_name + ".err"
 4340                            )
 4341                            err_files.append(tmp_annotation_vcf_name_err)
 4342
 4343                            # Annotate Command
 4344                            log.debug(
 4345                                f"Annotation '{annotation}' - add bcftools command"
 4346                            )
 4347
 4348                            # Command
 4349                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4350
 4351                            # Add command
 4352                            commands.append(command_annotate)
 4353
 4354            # if some commands
 4355            if commands:
 4356
 4357                # Export VCF file
 4358                self.export_variant_vcf(
 4359                    vcf_file=tmp_vcf_name,
 4360                    remove_info=True,
 4361                    add_samples=False,
 4362                    index=True,
 4363                )
 4364
 4365                # Threads
 4366                # calculate threads for annotated commands
 4367                if commands:
 4368                    threads_bcftools_annotate = round(threads / len(commands))
 4369                else:
 4370                    threads_bcftools_annotate = 1
 4371
 4372                if not threads_bcftools_annotate:
 4373                    threads_bcftools_annotate = 1
 4374
 4375                # Add threads option to bcftools commands
 4376                if threads_bcftools_annotate > 1:
 4377                    commands_threaded = []
 4378                    for command in commands:
 4379                        commands_threaded.append(
 4380                            command.replace(
 4381                                f"{bcftools_bin_command} annotate ",
 4382                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4383                            )
 4384                        )
 4385                    commands = commands_threaded
 4386
 4387                # Command annotation multithreading
 4388                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4389                log.info(
 4390                    f"Annotation - Annotation multithreaded in "
 4391                    + str(len(commands))
 4392                    + " commands"
 4393                )
 4394
 4395                run_parallel_commands(commands, threads)
 4396
 4397                # Merge
 4398                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4399
 4400                if tmp_ann_vcf_list_cmd:
 4401
 4402                    # Tmp file
 4403                    tmp_annotate_vcf = NamedTemporaryFile(
 4404                        prefix=self.get_prefix(),
 4405                        dir=self.get_tmp_dir(),
 4406                        suffix=".vcf.gz",
 4407                        delete=True,
 4408                    )
 4409                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4410                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4411                    err_files.append(tmp_annotate_vcf_name_err)
 4412
 4413                    # Tmp file remove command
 4414                    tmp_files_remove_command = ""
 4415                    if tmp_files:
 4416                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4417
 4418                    # Command merge
 4419                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4420                    log.info(
 4421                        f"Annotation - Annotation merging "
 4422                        + str(len(commands))
 4423                        + " annotated files"
 4424                    )
 4425                    log.debug(f"Annotation - merge command: {merge_command}")
 4426                    run_parallel_commands([merge_command], 1)
 4427
 4428                    # Error messages
 4429                    log.info(f"Error/Warning messages:")
 4430                    error_message_command_all = []
 4431                    error_message_command_warning = []
 4432                    error_message_command_err = []
 4433                    for err_file in err_files:
 4434                        with open(err_file, "r") as f:
 4435                            for line in f:
 4436                                message = line.strip()
 4437                                error_message_command_all.append(message)
 4438                                if line.startswith("[W::"):
 4439                                    error_message_command_warning.append(message)
 4440                                if line.startswith("[E::"):
 4441                                    error_message_command_err.append(
 4442                                        f"{err_file}: " + message
 4443                                    )
 4444                    # log info
 4445                    for message in list(
 4446                        set(error_message_command_err + error_message_command_warning)
 4447                    ):
 4448                        log.info(f"   {message}")
 4449                    # debug info
 4450                    for message in list(set(error_message_command_all)):
 4451                        log.debug(f"   {message}")
 4452                    # failed
 4453                    if len(error_message_command_err):
 4454                        log.error("Annotation failed: Error in commands")
 4455                        raise ValueError("Annotation failed: Error in commands")
 4456
 4457                    # Update variants
 4458                    log.info(f"Annotation - Updating...")
 4459                    self.update_from_vcf(tmp_annotate_vcf_name)
 4460
 4461    def annotation_exomiser(self, threads: int = None) -> None:
 4462        """
 4463        This function annotate with Exomiser
 4464
 4465        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4466        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
 4468            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4470            Default : None
 4471        - "preset" (string):
 4472            Analysis preset (available in config folder).
 4473            Used if no full "analysis" is provided.
 4474            Default: "exome"
 4475        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
 4477            Either a dict, or a file in JSON or YAML format.
 4478            Default: None
 4479        - "subject" (dict):
 4480            Sample parameters (see Exomiser docs).
 4481            Example:
 4482                "subject":
 4483                    {
 4484                        "id": "ISDBM322017",
 4485                        "sex": "FEMALE"
 4486                    }
 4487            Default: None
 4488        - "sample" (string):
 4489            Sample name to construct "subject" section:
 4490                "subject":
 4491                    {
 4492                        "id": "<sample>",
 4493                        "sex": "UNKNOWN_SEX"
 4494                    }
 4495            Default: None
 4496        - "phenotypicFeatures" (dict)
 4497            Phenotypic features to construct "subject" section.
 4498            Example:
 4499                "phenotypicFeatures":
 4500                    [
 4501                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4502                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4503                    ]
 4504        - "hpo" (list)
 4505            List of HPO ids as phenotypic features.
 4506            Example:
 4507                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4508            Default: []
 4509        - "outputOptions" (dict):
 4510            Output options (see Exomiser docs).
 4511            Default:
 4512                "output_options" =
 4513                    {
 4514                        "outputContributingVariantsOnly": False,
 4515                        "numGenes": 0,
 4516                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4517                    }
 4518        - "transcript_source" (string):
 4519            Transcript source (either "refseq", "ucsc", "ensembl")
 4520            Default: "refseq"
 4521        - "exomiser_to_info" (boolean):
 4522            Add exomiser TSV file columns as INFO fields in VCF.
 4523            Default: False
 4524        - "release" (string):
            Exomiser database release.
 4526            If not exists, database release will be downloaded (take a while).
 4527            Default: None (provided by application.properties configuration file)
 4528        - "exomiser_application_properties" (file):
 4529            Exomiser configuration file (see Exomiser docs).
 4530            Useful to automatically download databases (especially for specific genome databases).
 4531
 4532        Notes:
 4533        - If no sample in parameters, first sample in VCF will be chosen
 4534        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4535
 4536        :param threads: The number of threads to use
 4537        :return: None.
 4538        """
 4539
 4540        # DEBUG
 4541        log.debug("Start annotation with Exomiser databases")
 4542
 4543        # Threads
 4544        if not threads:
 4545            threads = self.get_threads()
 4546        log.debug("Threads: " + str(threads))
 4547
 4548        # Config
 4549        config = self.get_config()
 4550        log.debug("Config: " + str(config))
 4551
 4552        # Config - Folders - Databases
 4553        databases_folders = (
 4554            config.get("folders", {})
 4555            .get("databases", {})
 4556            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4557        )
 4558        databases_folders = full_path(databases_folders)
 4559        if not os.path.exists(databases_folders):
 4560            log.error(f"Databases annotations: {databases_folders} NOT found")
 4561        log.debug("Databases annotations: " + str(databases_folders))
 4562
 4563        # Config - Exomiser
 4564        exomiser_bin_command = get_bin_command(
 4565            bin="exomiser-cli*.jar",
 4566            tool="exomiser",
 4567            bin_type="jar",
 4568            config=config,
 4569            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4570        )
 4571        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4572        if not exomiser_bin_command:
 4573            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4574            log.error(msg_err)
 4575            raise ValueError(msg_err)
 4576
 4577        # Param
 4578        param = self.get_param()
 4579        log.debug("Param: " + str(param))
 4580
 4581        # Param - Exomiser
 4582        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4583        log.debug(f"Param Exomiser: {param_exomiser}")
 4584
 4585        # Param - Assembly
 4586        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4587        log.debug("Assembly: " + str(assembly))
 4588
 4589        # Data
 4590        table_variants = self.get_table_variants()
 4591
 4592        # Check if not empty
 4593        log.debug("Check if not empty")
 4594        sql_query_chromosomes = (
 4595            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4596        )
 4597        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4598            log.info(f"VCF empty")
 4599            return False
 4600
 4601        # VCF header
 4602        vcf_reader = self.get_header()
 4603        log.debug("Initial header: " + str(vcf_reader.infos))
 4604
 4605        # Samples
 4606        samples = self.get_header_sample_list()
 4607        if not samples:
 4608            log.error("No Samples in VCF")
 4609            return False
 4610        log.debug(f"Samples: {samples}")
 4611
 4612        # Memory limit
 4613        memory_limit = self.get_memory("8G")
 4614        log.debug(f"memory_limit: {memory_limit}")
 4615
 4616        # Exomiser java options
 4617        exomiser_java_options = (
 4618            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4619        )
 4620        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4621
 4622        # Download Exomiser (if not exists)
 4623        exomiser_release = param_exomiser.get("release", None)
 4624        exomiser_application_properties = param_exomiser.get(
 4625            "exomiser_application_properties", None
 4626        )
 4627        databases_download_exomiser(
 4628            assemblies=[assembly],
 4629            exomiser_folder=databases_folders,
 4630            exomiser_release=exomiser_release,
 4631            exomiser_phenotype_release=exomiser_release,
 4632            exomiser_application_properties=exomiser_application_properties,
 4633        )
 4634
 4635        # Force annotation
 4636        force_update_annotation = True
 4637
 4638        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4639            log.debug("Start annotation Exomiser")
 4640
 4641            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4642
 4643                # tmp_dir = "/tmp/exomiser"
 4644
 4645                ### ANALYSIS ###
 4646                ################
 4647
 4648                # Create analysis.json through analysis dict
 4649                # either analysis in param or by default
 4650                # depending on preset exome/genome)
 4651
 4652                # Init analysis dict
 4653                param_exomiser_analysis_dict = {}
 4654
 4655                # analysis from param
 4656                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4657                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4658
                # If analysis in param -> load analysis json
 4660                if param_exomiser_analysis:
 4661
 4662                    # If param analysis is a file and exists
 4663                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4664                        param_exomiser_analysis
 4665                    ):
 4666                        # Load analysis file into analysis dict (either yaml or json)
 4667                        with open(param_exomiser_analysis) as json_file:
 4668                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4669
 4670                    # If param analysis is a dict
 4671                    elif isinstance(param_exomiser_analysis, dict):
 4672                        # Load analysis dict into analysis dict (either yaml or json)
 4673                        param_exomiser_analysis_dict = param_exomiser_analysis
 4674
 4675                    # Error analysis type
 4676                    else:
 4677                        log.error(f"Analysis type unknown. Check param file.")
 4678                        raise ValueError(f"Analysis type unknown. Check param file.")
 4679
 4680                # Case no input analysis config file/dict
 4681                # Use preset (exome/genome) to open default config file
 4682                if not param_exomiser_analysis_dict:
 4683
 4684                    # default preset
 4685                    default_preset = "exome"
 4686
 4687                    # Get param preset or default preset
 4688                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4689
 4690                    # Try to find if preset is a file
 4691                    if os.path.exists(param_exomiser_preset):
 4692                        # Preset file is provided in full path
 4693                        param_exomiser_analysis_default_config_file = (
 4694                            param_exomiser_preset
 4695                        )
 4696                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4697                    #     # Preset file is provided in full path
 4698                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4699                    elif os.path.exists(
 4700                        os.path.join(folder_config, param_exomiser_preset)
 4701                    ):
 4702                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4703                        param_exomiser_analysis_default_config_file = os.path.join(
 4704                            folder_config, param_exomiser_preset
 4705                        )
 4706                    else:
 4707                        # Construct preset file
 4708                        param_exomiser_analysis_default_config_file = os.path.join(
 4709                            folder_config,
 4710                            f"preset-{param_exomiser_preset}-analysis.json",
 4711                        )
 4712
 4713                    # If preset file exists
 4714                    param_exomiser_analysis_default_config_file = full_path(
 4715                        param_exomiser_analysis_default_config_file
 4716                    )
 4717                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
 4719                        with open(
 4720                            param_exomiser_analysis_default_config_file
 4721                        ) as json_file:
 4722                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4723                                json_file
 4724                            )
 4725
 4726                    # Error preset file
 4727                    else:
 4728                        log.error(
 4729                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4730                        )
 4731                        raise ValueError(
 4732                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4733                        )
 4734
 4735                # If no analysis dict created
 4736                if not param_exomiser_analysis_dict:
 4737                    log.error(f"No analysis config")
 4738                    raise ValueError(f"No analysis config")
 4739
 4740                # Log
 4741                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4742
 4743                ### PHENOPACKET ###
 4744                ###################
 4745
 4746                # If no PhenoPacket in analysis dict -> check in param
 4747                if "phenopacket" not in param_exomiser_analysis_dict:
 4748
                    # If PhenoPacket in param -> load analysis json
 4750                    if param_exomiser.get("phenopacket", None):
 4751
 4752                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4753                        param_exomiser_phenopacket = full_path(
 4754                            param_exomiser_phenopacket
 4755                        )
 4756
 4757                        # If param phenopacket is a file and exists
 4758                        if isinstance(
 4759                            param_exomiser_phenopacket, str
 4760                        ) and os.path.exists(param_exomiser_phenopacket):
 4761                            # Load phenopacket file into analysis dict (either yaml or json)
 4762                            with open(param_exomiser_phenopacket) as json_file:
 4763                                param_exomiser_analysis_dict["phenopacket"] = (
 4764                                    yaml.safe_load(json_file)
 4765                                )
 4766
 4767                        # If param phenopacket is a dict
 4768                        elif isinstance(param_exomiser_phenopacket, dict):
 4769                            # Load phenopacket dict into analysis dict (either yaml or json)
 4770                            param_exomiser_analysis_dict["phenopacket"] = (
 4771                                param_exomiser_phenopacket
 4772                            )
 4773
 4774                        # Error phenopacket type
 4775                        else:
 4776                            log.error(f"Phenopacket type unknown. Check param file.")
 4777                            raise ValueError(
 4778                                f"Phenopacket type unknown. Check param file."
 4779                            )
 4780
 4781                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4782                if "phenopacket" not in param_exomiser_analysis_dict:
 4783
 4784                    # Init PhenoPacket
 4785                    param_exomiser_analysis_dict["phenopacket"] = {
 4786                        "id": "analysis",
 4787                        "proband": {},
 4788                    }
 4789
 4790                    ### Add subject ###
 4791
 4792                    # If subject exists
 4793                    param_exomiser_subject = param_exomiser.get("subject", {})
 4794
 4795                    # If subject not exists -> found sample ID
 4796                    if not param_exomiser_subject:
 4797
 4798                        # Found sample ID in param
 4799                        sample = param_exomiser.get("sample", None)
 4800
 4801                        # Find sample ID (first sample)
 4802                        if not sample:
 4803                            sample_list = self.get_header_sample_list()
 4804                            if len(sample_list) > 0:
 4805                                sample = sample_list[0]
 4806                            else:
 4807                                log.error(f"No sample found")
 4808                                raise ValueError(f"No sample found")
 4809
 4810                        # Create subject
 4811                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4812
 4813                    # Add to dict
 4814                    param_exomiser_analysis_dict["phenopacket"][
 4815                        "subject"
 4816                    ] = param_exomiser_subject
 4817
 4818                    ### Add "phenotypicFeatures" ###
 4819
 4820                    # If phenotypicFeatures exists
 4821                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4822                        "phenotypicFeatures", []
 4823                    )
 4824
 4825                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4826                    if not param_exomiser_phenotypicfeatures:
 4827
 4828                        # Found HPO in param
 4829                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4830
 4831                        # Split HPO if list in string format separated by comma
 4832                        if isinstance(param_exomiser_hpo, str):
 4833                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4834
 4835                        # Create HPO list
 4836                        for hpo in param_exomiser_hpo:
 4837                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4838                            param_exomiser_phenotypicfeatures.append(
 4839                                {
 4840                                    "type": {
 4841                                        "id": f"HP:{hpo_clean}",
 4842                                        "label": f"HP:{hpo_clean}",
 4843                                    }
 4844                                }
 4845                            )
 4846
 4847                    # Add to dict
 4848                    param_exomiser_analysis_dict["phenopacket"][
 4849                        "phenotypicFeatures"
 4850                    ] = param_exomiser_phenotypicfeatures
 4851
 4852                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4853                    if not param_exomiser_phenotypicfeatures:
 4854                        for step in param_exomiser_analysis_dict.get(
 4855                            "analysis", {}
 4856                        ).get("steps", []):
 4857                            if "hiPhivePrioritiser" in step:
 4858                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4859                                    "steps", []
 4860                                ).remove(step)
 4861
 4862                ### Add Input File ###
 4863
 4864                # Initial file name and htsFiles
 4865                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4866                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4867                    {
 4868                        "uri": tmp_vcf_name,
 4869                        "htsFormat": "VCF",
 4870                        "genomeAssembly": assembly,
 4871                    }
 4872                ]
 4873
 4874                ### Add metaData ###
 4875
 4876                # If metaData not in analysis dict
 4877                if "metaData" not in param_exomiser_analysis_dict:
 4878                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4879                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4880                        "createdBy": "howard",
 4881                        "phenopacketSchemaVersion": 1,
 4882                    }
 4883
 4884                ### OutputOptions ###
 4885
 4886                # Init output result folder
 4887                output_results = os.path.join(tmp_dir, "results")
 4888
 4889                # If no outputOptions in analysis dict
 4890                if "outputOptions" not in param_exomiser_analysis_dict:
 4891
 4892                    # default output formats
 4893                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4894
 4895                    # Get outputOptions in param
 4896                    output_options = param_exomiser.get("outputOptions", None)
 4897
 4898                    # If no output_options in param -> check
 4899                    if not output_options:
 4900                        output_options = {
 4901                            "outputContributingVariantsOnly": False,
 4902                            "numGenes": 0,
 4903                            "outputFormats": defaut_output_formats,
 4904                        }
 4905
 4906                    # Replace outputDirectory in output options
 4907                    output_options["outputDirectory"] = output_results
 4908                    output_options["outputFileName"] = "howard"
 4909
 4910                    # Add outputOptions in analysis dict
 4911                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4912
 4913                else:
 4914
 4915                    # Replace output_results and output format (if exists in param)
 4916                    param_exomiser_analysis_dict["outputOptions"][
 4917                        "outputDirectory"
 4918                    ] = output_results
 4919                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4920                        list(
 4921                            set(
 4922                                param_exomiser_analysis_dict.get(
 4923                                    "outputOptions", {}
 4924                                ).get("outputFormats", [])
 4925                                + ["TSV_VARIANT", "VCF"]
 4926                            )
 4927                        )
 4928                    )
 4929
 4930                # log
 4931                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4932
 4933                ### ANALYSIS FILE ###
 4934                #####################
 4935
 4936                ### Full JSON analysis config file ###
 4937
 4938                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4939                with open(exomiser_analysis, "w") as fp:
 4940                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4941
 4942                ### SPLIT analysis and sample config files
 4943
 4944                # Splitted analysis dict
 4945                param_exomiser_analysis_dict_for_split = (
 4946                    param_exomiser_analysis_dict.copy()
 4947                )
 4948
 4949                # Phenopacket JSON file
 4950                exomiser_analysis_phenopacket = os.path.join(
 4951                    tmp_dir, "analysis_phenopacket.json"
 4952                )
 4953                with open(exomiser_analysis_phenopacket, "w") as fp:
 4954                    json.dump(
 4955                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4956                        fp,
 4957                        indent=4,
 4958                    )
 4959
 4960                # Analysis JSON file without Phenopacket parameters
 4961                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4962                exomiser_analysis_analysis = os.path.join(
 4963                    tmp_dir, "analysis_analysis.json"
 4964                )
 4965                with open(exomiser_analysis_analysis, "w") as fp:
 4966                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4967
 4968                ### INITAL VCF file ###
 4969                #######################
 4970
 4971                ### Create list of samples to use and include inti initial VCF file ####
 4972
 4973                # Subject (main sample)
 4974                # Get sample ID in analysis dict
 4975                sample_subject = (
 4976                    param_exomiser_analysis_dict.get("phenopacket", {})
 4977                    .get("subject", {})
 4978                    .get("id", None)
 4979                )
 4980                sample_proband = (
 4981                    param_exomiser_analysis_dict.get("phenopacket", {})
 4982                    .get("proband", {})
 4983                    .get("subject", {})
 4984                    .get("id", None)
 4985                )
 4986                sample = []
 4987                if sample_subject:
 4988                    sample.append(sample_subject)
 4989                if sample_proband:
 4990                    sample.append(sample_proband)
 4991
 4992                # Get sample ID within Pedigree
 4993                pedigree_persons_list = (
 4994                    param_exomiser_analysis_dict.get("phenopacket", {})
 4995                    .get("pedigree", {})
 4996                    .get("persons", {})
 4997                )
 4998
 4999                # Create list with all sample ID in pedigree (if exists)
 5000                pedigree_persons = []
 5001                for person in pedigree_persons_list:
 5002                    pedigree_persons.append(person.get("individualId"))
 5003
 5004                # Concat subject sample ID and samples ID in pedigreesamples
 5005                samples = list(set(sample + pedigree_persons))
 5006
 5007                # Check if sample list is not empty
 5008                if not samples:
 5009                    log.error(f"No samples found")
 5010                    raise ValueError(f"No samples found")
 5011
 5012                # Create VCF with sample (either sample in param or first one by default)
 5013                # Export VCF file
 5014                self.export_variant_vcf(
 5015                    vcf_file=tmp_vcf_name,
 5016                    remove_info=True,
 5017                    add_samples=True,
 5018                    list_samples=samples,
 5019                    index=False,
 5020                )
 5021
 5022                ### Execute Exomiser ###
 5023                ########################
 5024
 5025                # Init command
 5026                exomiser_command = ""
 5027
 5028                # Command exomiser options
 5029                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5030
 5031                # Release
 5032                exomiser_release = param_exomiser.get("release", None)
 5033                if exomiser_release:
 5034                    # phenotype data version
 5035                    exomiser_options += (
 5036                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5037                    )
 5038                    # data version
 5039                    exomiser_options += (
 5040                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5041                    )
 5042                    # variant white list
 5043                    variant_white_list_file = (
 5044                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5045                    )
 5046                    if os.path.exists(
 5047                        os.path.join(
 5048                            databases_folders, assembly, variant_white_list_file
 5049                        )
 5050                    ):
 5051                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5052
 5053                # transcript_source
 5054                transcript_source = param_exomiser.get(
 5055                    "transcript_source", None
 5056                )  # ucsc, refseq, ensembl
 5057                if transcript_source:
 5058                    exomiser_options += (
 5059                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5060                    )
 5061
 5062                # If analysis contain proband param
 5063                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5064                    "proband", {}
 5065                ):
 5066                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5067
 5068                # If no proband (usually uniq sample)
 5069                else:
 5070                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5071
 5072                # Log
 5073                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5074
 5075                # Run command
 5076                result = subprocess.call(
 5077                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5078                )
 5079                if result:
 5080                    log.error("Exomiser command failed")
 5081                    raise ValueError("Exomiser command failed")
 5082
 5083                ### RESULTS ###
 5084                ###############
 5085
 5086                ### Annotate with TSV fields ###
 5087
 5088                # Init result tsv file
 5089                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5090
 5091                # Init result tsv file
 5092                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5093
 5094                # Parse TSV file and explode columns in INFO field
 5095                if exomiser_to_info and os.path.exists(output_results_tsv):
 5096
 5097                    # Log
 5098                    log.debug("Exomiser columns to VCF INFO field")
 5099
 5100                    # Retrieve columns and types
 5101                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5102                    output_results_tsv_df = self.get_query_to_df(query)
 5103                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5104
 5105                    # Init concat fields for update
 5106                    sql_query_update_concat_fields = []
 5107
 5108                    # Fields to avoid
 5109                    fields_to_avoid = [
 5110                        "CONTIG",
 5111                        "START",
 5112                        "END",
 5113                        "REF",
 5114                        "ALT",
 5115                        "QUAL",
 5116                        "FILTER",
 5117                        "GENOTYPE",
 5118                    ]
 5119
 5120                    # List all columns to add into header
 5121                    for header_column in output_results_tsv_columns:
 5122
 5123                        # If header column is enable
 5124                        if header_column not in fields_to_avoid:
 5125
 5126                            # Header info type
 5127                            header_info_type = "String"
 5128                            header_column_df = output_results_tsv_df[header_column]
 5129                            header_column_df_dtype = header_column_df.dtype
 5130                            if header_column_df_dtype == object:
 5131                                if (
 5132                                    pd.to_numeric(header_column_df, errors="coerce")
 5133                                    .notnull()
 5134                                    .all()
 5135                                ):
 5136                                    header_info_type = "Float"
 5137                            else:
 5138                                header_info_type = "Integer"
 5139
 5140                            # Header info
 5141                            characters_to_validate = ["-"]
 5142                            pattern = "[" + "".join(characters_to_validate) + "]"
 5143                            header_info_name = re.sub(
 5144                                pattern,
 5145                                "_",
 5146                                f"Exomiser_{header_column}".replace("#", ""),
 5147                            )
 5148                            header_info_number = "."
 5149                            header_info_description = (
 5150                                f"Exomiser {header_column} annotation"
 5151                            )
 5152                            header_info_source = "Exomiser"
 5153                            header_info_version = "unknown"
 5154                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5155                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5156                                header_info_name,
 5157                                header_info_number,
 5158                                header_info_type,
 5159                                header_info_description,
 5160                                header_info_source,
 5161                                header_info_version,
 5162                                header_info_code,
 5163                            )
 5164
 5165                            # Add field to add for update to concat fields
 5166                            sql_query_update_concat_fields.append(
 5167                                f"""
 5168                                CASE
 5169                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5170                                    THEN concat(
 5171                                        '{header_info_name}=',
 5172                                        table_parquet."{header_column}",
 5173                                        ';'
 5174                                        )
 5175
 5176                                    ELSE ''
 5177                                END
 5178                            """
 5179                            )
 5180
 5181                    # Update query
 5182                    sql_query_update = f"""
 5183                        UPDATE {table_variants} as table_variants
 5184                            SET INFO = concat(
 5185                                            CASE
 5186                                                WHEN INFO NOT IN ('', '.')
 5187                                                THEN INFO
 5188                                                ELSE ''
 5189                                            END,
 5190                                            CASE
 5191                                                WHEN table_variants.INFO NOT IN ('','.')
 5192                                                THEN ';'
 5193                                                ELSE ''
 5194                                            END,
 5195                                            (
 5196                                            SELECT 
 5197                                                concat(
 5198                                                    {",".join(sql_query_update_concat_fields)}
 5199                                                )
 5200                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5201                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5202                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5203                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5204                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5205                                            )
 5206                                        )
 5207                            ;
 5208                        """
 5209
 5210                    # Update
 5211                    self.conn.execute(sql_query_update)
 5212
 5213                ### Annotate with VCF INFO field ###
 5214
 5215                # Init result VCF file
 5216                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5217
 5218                # If VCF exists
 5219                if os.path.exists(output_results_vcf):
 5220
 5221                    # Log
 5222                    log.debug("Exomiser result VCF update variants")
 5223
 5224                    # Find Exomiser INFO field annotation in header
 5225                    with gzip.open(output_results_vcf, "rt") as f:
 5226                        header_list = self.read_vcf_header(f)
 5227                    exomiser_vcf_header = vcf.Reader(
 5228                        io.StringIO("\n".join(header_list))
 5229                    )
 5230
 5231                    # Add annotation INFO field to header
 5232                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5233
 5234                    # Update variants with VCF
 5235                    self.update_from_vcf(output_results_vcf)
 5236
 5237        return True
 5238
 5239    def annotation_snpeff(self, threads: int = None) -> None:
 5240        """
 5241        This function annotate with snpEff
 5242
 5243        :param threads: The number of threads to use
 5244        :return: the value of the variable "return_value".
 5245        """
 5246
 5247        # DEBUG
 5248        log.debug("Start annotation with snpeff databases")
 5249
 5250        # Threads
 5251        if not threads:
 5252            threads = self.get_threads()
 5253        log.debug("Threads: " + str(threads))
 5254
 5255        # DEBUG
 5256        delete_tmp = True
 5257        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5258            delete_tmp = False
 5259            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5260
 5261        # Config
 5262        config = self.get_config()
 5263        log.debug("Config: " + str(config))
 5264
 5265        # Config - Folders - Databases
 5266        databases_folders = (
 5267            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5268        )
 5269        log.debug("Databases annotations: " + str(databases_folders))
 5270
 5271        # Config - snpEff bin command
 5272        snpeff_bin_command = get_bin_command(
 5273            bin="snpEff.jar",
 5274            tool="snpeff",
 5275            bin_type="jar",
 5276            config=config,
 5277            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5278        )
 5279        if not snpeff_bin_command:
 5280            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5281            log.error(msg_err)
 5282            raise ValueError(msg_err)
 5283
 5284        # Config - snpEff databases
 5285        snpeff_databases = (
 5286            config.get("folders", {})
 5287            .get("databases", {})
 5288            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5289        )
 5290        snpeff_databases = full_path(snpeff_databases)
 5291        if snpeff_databases is not None and snpeff_databases != "":
 5292            log.debug(f"Create snpEff databases folder")
 5293            if not os.path.exists(snpeff_databases):
 5294                os.makedirs(snpeff_databases)
 5295
 5296        # Param
 5297        param = self.get_param()
 5298        log.debug("Param: " + str(param))
 5299
 5300        # Param
 5301        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5302        log.debug("Options: " + str(options))
 5303
 5304        # Param - Assembly
 5305        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5306
 5307        # Param - Options
 5308        snpeff_options = (
 5309            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5310        )
 5311        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5312        snpeff_csvstats = (
 5313            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5314        )
 5315        if snpeff_stats:
 5316            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5317            snpeff_stats = full_path(snpeff_stats)
 5318            snpeff_options += f" -stats {snpeff_stats}"
 5319        if snpeff_csvstats:
 5320            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5321            snpeff_csvstats = full_path(snpeff_csvstats)
 5322            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5323
 5324        # Data
 5325        table_variants = self.get_table_variants()
 5326
 5327        # Check if not empty
 5328        log.debug("Check if not empty")
 5329        sql_query_chromosomes = (
 5330            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5331        )
 5332        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5333        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5334            log.info(f"VCF empty")
 5335            return
 5336
 5337        # Export in VCF
 5338        log.debug("Create initial file to annotate")
 5339        tmp_vcf = NamedTemporaryFile(
 5340            prefix=self.get_prefix(),
 5341            dir=self.get_tmp_dir(),
 5342            suffix=".vcf.gz",
 5343            delete=True,
 5344        )
 5345        tmp_vcf_name = tmp_vcf.name
 5346
 5347        # VCF header
 5348        vcf_reader = self.get_header()
 5349        log.debug("Initial header: " + str(vcf_reader.infos))
 5350
 5351        # Existing annotations
 5352        for vcf_annotation in self.get_header().infos:
 5353
 5354            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5355            log.debug(
 5356                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5357            )
 5358
 5359        # Memory limit
 5360        # if config.get("memory", None):
 5361        #     memory_limit = config.get("memory", "8G")
 5362        # else:
 5363        #     memory_limit = "8G"
 5364        memory_limit = self.get_memory("8G")
 5365        log.debug(f"memory_limit: {memory_limit}")
 5366
 5367        # snpEff java options
 5368        snpeff_java_options = (
 5369            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5370        )
 5371        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5372
 5373        force_update_annotation = True
 5374
 5375        if "ANN" not in self.get_header().infos or force_update_annotation:
 5376
 5377            # Check snpEff database
 5378            log.debug(f"Check snpEff databases {[assembly]}")
 5379            databases_download_snpeff(
 5380                folder=snpeff_databases, assemblies=[assembly], config=config
 5381            )
 5382
 5383            # Export VCF file
 5384            self.export_variant_vcf(
 5385                vcf_file=tmp_vcf_name,
 5386                remove_info=True,
 5387                add_samples=False,
 5388                index=True,
 5389            )
 5390
 5391            # Tmp file
 5392            err_files = []
 5393            tmp_annotate_vcf = NamedTemporaryFile(
 5394                prefix=self.get_prefix(),
 5395                dir=self.get_tmp_dir(),
 5396                suffix=".vcf",
 5397                delete=False,
 5398            )
 5399            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5400            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5401            err_files.append(tmp_annotate_vcf_name_err)
 5402
 5403            # Command
 5404            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5405            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5406            run_parallel_commands([snpeff_command], 1)
 5407
 5408            # Error messages
 5409            log.info(f"Error/Warning messages:")
 5410            error_message_command_all = []
 5411            error_message_command_warning = []
 5412            error_message_command_err = []
 5413            for err_file in err_files:
 5414                with open(err_file, "r") as f:
 5415                    for line in f:
 5416                        message = line.strip()
 5417                        error_message_command_all.append(message)
 5418                        if line.startswith("[W::"):
 5419                            error_message_command_warning.append(message)
 5420                        if line.startswith("[E::"):
 5421                            error_message_command_err.append(f"{err_file}: " + message)
 5422            # log info
 5423            for message in list(
 5424                set(error_message_command_err + error_message_command_warning)
 5425            ):
 5426                log.info(f"   {message}")
 5427            # debug info
 5428            for message in list(set(error_message_command_all)):
 5429                log.debug(f"   {message}")
 5430            # failed
 5431            if len(error_message_command_err):
 5432                log.error("Annotation failed: Error in commands")
 5433                raise ValueError("Annotation failed: Error in commands")
 5434
 5435            # Find annotation in header
 5436            with open(tmp_annotate_vcf_name, "rt") as f:
 5437                header_list = self.read_vcf_header(f)
 5438            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5439
 5440            for ann in annovar_vcf_header.infos:
 5441                if ann not in self.get_header().infos:
 5442                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5443
 5444            # Update variants
 5445            log.info(f"Annotation - Updating...")
 5446            self.update_from_vcf(tmp_annotate_vcf_name)
 5447
 5448        else:
 5449            if "ANN" in self.get_header().infos:
 5450                log.debug(f"Existing snpEff annotations in VCF")
 5451            if force_update_annotation:
 5452                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5453
 5454    def annotation_annovar(self, threads: int = None) -> None:
 5455        """
 5456        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5457        annotations
 5458
 5459        :param threads: number of threads to use
 5460        :return: the value of the variable "return_value".
 5461        """
 5462
 5463        # DEBUG
 5464        log.debug("Start annotation with Annovar databases")
 5465
 5466        # Threads
 5467        if not threads:
 5468            threads = self.get_threads()
 5469        log.debug("Threads: " + str(threads))
 5470
 5471        # Tmp en Err files
 5472        tmp_files = []
 5473        err_files = []
 5474
 5475        # DEBUG
 5476        delete_tmp = True
 5477        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5478            delete_tmp = False
 5479            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5480
 5481        # Config
 5482        config = self.get_config()
 5483        log.debug("Config: " + str(config))
 5484
 5485        # Config - Folders - Databases
 5486        databases_folders = (
 5487            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5488        )
 5489        log.debug("Databases annotations: " + str(databases_folders))
 5490
 5491        # Config - annovar bin command
 5492        annovar_bin_command = get_bin_command(
 5493            bin="table_annovar.pl",
 5494            tool="annovar",
 5495            bin_type="perl",
 5496            config=config,
 5497            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5498        )
 5499        if not annovar_bin_command:
 5500            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5501            log.error(msg_err)
 5502            raise ValueError(msg_err)
 5503
 5504        # Config - BCFTools bin command
 5505        bcftools_bin_command = get_bin_command(
 5506            bin="bcftools",
 5507            tool="bcftools",
 5508            bin_type="bin",
 5509            config=config,
 5510            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5511        )
 5512        if not bcftools_bin_command:
 5513            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5514            log.error(msg_err)
 5515            raise ValueError(msg_err)
 5516
 5517        # Config - annovar databases
 5518        annovar_databases = (
 5519            config.get("folders", {})
 5520            .get("databases", {})
 5521            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5522        )
 5523        if annovar_databases is not None:
 5524            if isinstance(annovar_databases, list):
 5525                annovar_databases = full_path(annovar_databases[0])
 5526                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5527            annovar_databases = full_path(annovar_databases)
 5528            if not os.path.exists(annovar_databases):
 5529                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5530                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5531        else:
 5532            msg_err = f"Annovar databases configuration failed"
 5533            log.error(msg_err)
 5534            raise ValueError(msg_err)
 5535
 5536        # Param
 5537        param = self.get_param()
 5538        log.debug("Param: " + str(param))
 5539
 5540        # Param - options
 5541        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5542        log.debug("Options: " + str(options))
 5543
 5544        # Param - annotations
 5545        annotations = (
 5546            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5547        )
 5548        log.debug("Annotations: " + str(annotations))
 5549
 5550        # Param - Assembly
 5551        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5552
 5553        # Annovar database assembly
 5554        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5555        if annovar_databases_assembly != "" and not os.path.exists(
 5556            annovar_databases_assembly
 5557        ):
 5558            os.makedirs(annovar_databases_assembly)
 5559
 5560        # Data
 5561        table_variants = self.get_table_variants()
 5562
 5563        # Check if not empty
 5564        log.debug("Check if not empty")
 5565        sql_query_chromosomes = (
 5566            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5567        )
 5568        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5569        if not sql_query_chromosomes_df["count"][0]:
 5570            log.info(f"VCF empty")
 5571            return
 5572
 5573        # VCF header
 5574        vcf_reader = self.get_header()
 5575        log.debug("Initial header: " + str(vcf_reader.infos))
 5576
 5577        # Existing annotations
 5578        for vcf_annotation in self.get_header().infos:
 5579
 5580            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5581            log.debug(
 5582                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5583            )
 5584
 5585        force_update_annotation = True
 5586
 5587        if annotations:
 5588
 5589            commands = []
 5590            tmp_annotates_vcf_name_list = []
 5591
 5592            # Export in VCF
 5593            log.debug("Create initial file to annotate")
 5594            tmp_vcf = NamedTemporaryFile(
 5595                prefix=self.get_prefix(),
 5596                dir=self.get_tmp_dir(),
 5597                suffix=".vcf.gz",
 5598                delete=False,
 5599            )
 5600            tmp_vcf_name = tmp_vcf.name
 5601            tmp_files.append(tmp_vcf_name)
 5602            tmp_files.append(tmp_vcf_name + ".tbi")
 5603
 5604            # Export VCF file
 5605            self.export_variant_vcf(
 5606                vcf_file=tmp_vcf_name,
 5607                remove_info=".",
 5608                add_samples=False,
 5609                index=True,
 5610            )
 5611
 5612            # Create file for field rename
 5613            log.debug("Create file for field rename")
 5614            tmp_rename = NamedTemporaryFile(
 5615                prefix=self.get_prefix(),
 5616                dir=self.get_tmp_dir(),
 5617                suffix=".rename",
 5618                delete=False,
 5619            )
 5620            tmp_rename_name = tmp_rename.name
 5621            tmp_files.append(tmp_rename_name)
 5622
 5623            # Check Annovar database
 5624            log.debug(
 5625                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5626            )
 5627            databases_download_annovar(
 5628                folder=annovar_databases,
 5629                files=list(annotations.keys()),
 5630                assemblies=[assembly],
 5631            )
 5632
 5633            for annotation in annotations:
 5634                annotation_fields = annotations[annotation]
 5635
 5636                if not annotation_fields:
 5637                    annotation_fields = {"INFO": None}
 5638
 5639                log.info(f"Annotations Annovar - database '{annotation}'")
 5640                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5641
 5642                # Tmp file for annovar
 5643                err_files = []
 5644                tmp_annotate_vcf_directory = TemporaryDirectory(
 5645                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5646                )
 5647                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5648                tmp_annotate_vcf_name_annovar = (
 5649                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5650                )
 5651                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5652                err_files.append(tmp_annotate_vcf_name_err)
 5653                tmp_files.append(tmp_annotate_vcf_name_err)
 5654
 5655                # Tmp file final vcf annotated by annovar
 5656                tmp_annotate_vcf = NamedTemporaryFile(
 5657                    prefix=self.get_prefix(),
 5658                    dir=self.get_tmp_dir(),
 5659                    suffix=".vcf.gz",
 5660                    delete=False,
 5661                )
 5662                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5663                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5664                tmp_files.append(tmp_annotate_vcf_name)
 5665                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5666
 5667                # Number of fields
 5668                annotation_list = []
 5669                annotation_renamed_list = []
 5670
 5671                for annotation_field in annotation_fields:
 5672
 5673                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5674                    annotation_fields_new_name = annotation_fields.get(
 5675                        annotation_field, annotation_field
 5676                    )
 5677                    if not annotation_fields_new_name:
 5678                        annotation_fields_new_name = annotation_field
 5679
 5680                    if (
 5681                        force_update_annotation
 5682                        or annotation_fields_new_name not in self.get_header().infos
 5683                    ):
 5684                        annotation_list.append(annotation_field)
 5685                        annotation_renamed_list.append(annotation_fields_new_name)
 5686                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5687                        log.warning(
 5688                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5689                        )
 5690
 5691                    # Add rename info
 5692                    run_parallel_commands(
 5693                        [
 5694                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5695                        ],
 5696                        1,
 5697                    )
 5698
 5699                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5700                log.debug("annotation_list: " + str(annotation_list))
 5701
 5702                # protocol
 5703                protocol = annotation
 5704
 5705                # argument
 5706                argument = ""
 5707
 5708                # operation
 5709                operation = "f"
 5710                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5711                    "ensGene"
 5712                ):
 5713                    operation = "g"
 5714                    if options.get("genebase", None):
 5715                        argument = f"""'{options.get("genebase","")}'"""
 5716                elif annotation in ["cytoBand"]:
 5717                    operation = "r"
 5718
 5719                # argument option
 5720                argument_option = ""
 5721                if argument != "":
 5722                    argument_option = " --argument " + argument
 5723
 5724                # command options
 5725                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5726                for option in options:
 5727                    if option not in ["genebase"]:
 5728                        command_options += f""" --{option}={options[option]}"""
 5729
 5730                # Command
 5731
 5732                # Command - Annovar
 5733                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5734                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5735
 5736                # Command - start pipe
 5737                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5738
 5739                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5740                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5741
 5742                # Command - Special characters (refGene annotation)
 5743                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5744
 5745                # Command - Clean empty fields (with value ".")
 5746                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5747
 5748                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5749                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5750                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5751                    # for ann in annotation_renamed_list:
 5752                    for ann in annotation_list:
 5753                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5754
 5755                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5756
 5757                # Command - indexing
 5758                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5759
 5760                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5761                run_parallel_commands([command_annovar], 1)
 5762
 5763                # Error messages
 5764                log.info(f"Error/Warning messages:")
 5765                error_message_command_all = []
 5766                error_message_command_warning = []
 5767                error_message_command_err = []
 5768                for err_file in err_files:
 5769                    with open(err_file, "r") as f:
 5770                        for line in f:
 5771                            message = line.strip()
 5772                            error_message_command_all.append(message)
 5773                            if line.startswith("[W::") or line.startswith("WARNING"):
 5774                                error_message_command_warning.append(message)
 5775                            if line.startswith("[E::") or line.startswith("ERROR"):
 5776                                error_message_command_err.append(
 5777                                    f"{err_file}: " + message
 5778                                )
 5779                # log info
 5780                for message in list(
 5781                    set(error_message_command_err + error_message_command_warning)
 5782                ):
 5783                    log.info(f"   {message}")
 5784                # debug info
 5785                for message in list(set(error_message_command_all)):
 5786                    log.debug(f"   {message}")
 5787                # failed
 5788                if len(error_message_command_err):
 5789                    log.error("Annotation failed: Error in commands")
 5790                    raise ValueError("Annotation failed: Error in commands")
 5791
 5792            if tmp_annotates_vcf_name_list:
 5793
 5794                # List of annotated files
 5795                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5796
 5797                # Tmp file
 5798                tmp_annotate_vcf = NamedTemporaryFile(
 5799                    prefix=self.get_prefix(),
 5800                    dir=self.get_tmp_dir(),
 5801                    suffix=".vcf.gz",
 5802                    delete=False,
 5803                )
 5804                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5805                tmp_files.append(tmp_annotate_vcf_name)
 5806                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5807                err_files.append(tmp_annotate_vcf_name_err)
 5808                tmp_files.append(tmp_annotate_vcf_name_err)
 5809
 5810                # Command merge
 5811                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5812                log.info(
 5813                    f"Annotation Annovar - Annotation merging "
 5814                    + str(len(tmp_annotates_vcf_name_list))
 5815                    + " annotated files"
 5816                )
 5817                log.debug(f"Annotation - merge command: {merge_command}")
 5818                run_parallel_commands([merge_command], 1)
 5819
 5820                # Find annotation in header
 5821                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5822                    header_list = self.read_vcf_header(f)
 5823                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5824
 5825                for ann in annovar_vcf_header.infos:
 5826                    if ann not in self.get_header().infos:
 5827                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5828
 5829                # Update variants
 5830                log.info(f"Annotation Annovar - Updating...")
 5831                self.update_from_vcf(tmp_annotate_vcf_name)
 5832
 5833            # Clean files
 5834            # Tmp file remove command
 5835            if True:
 5836                tmp_files_remove_command = ""
 5837                if tmp_files:
 5838                    tmp_files_remove_command = " ".join(tmp_files)
 5839                clean_command = f" rm -f {tmp_files_remove_command} "
 5840                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5841                log.debug(f"Annotation - cleaning command: {clean_command}")
 5842                run_parallel_commands([clean_command], 1)
 5843
    # Annotation with Parquet-format databases
 5845    def annotation_parquet(self, threads: int = None) -> None:
 5846        """
 5847        It takes a VCF file, and annotates it with a parquet file
 5848
 5849        :param threads: number of threads to use for the annotation
 5850        :return: the value of the variable "result".
 5851        """
 5852
 5853        # DEBUG
 5854        log.debug("Start annotation with parquet databases")
 5855
 5856        # Threads
 5857        if not threads:
 5858            threads = self.get_threads()
 5859        log.debug("Threads: " + str(threads))
 5860
 5861        # DEBUG
 5862        delete_tmp = True
 5863        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5864            delete_tmp = False
 5865            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5866
 5867        # Config
 5868        databases_folders = set(
 5869            self.get_config()
 5870            .get("folders", {})
 5871            .get("databases", {})
 5872            .get("annotations", ["."])
 5873            + self.get_config()
 5874            .get("folders", {})
 5875            .get("databases", {})
 5876            .get("parquet", ["."])
 5877        )
 5878        log.debug("Databases annotations: " + str(databases_folders))
 5879
 5880        # Param
 5881        annotations = (
 5882            self.get_param()
 5883            .get("annotation", {})
 5884            .get("parquet", {})
 5885            .get("annotations", None)
 5886        )
 5887        log.debug("Annotations: " + str(annotations))
 5888
 5889        # Assembly
 5890        assembly = self.get_param().get(
 5891            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5892        )
 5893
 5894        # Force Update Annotation
 5895        force_update_annotation = (
 5896            self.get_param()
 5897            .get("annotation", {})
 5898            .get("options", {})
 5899            .get("annotations_update", False)
 5900        )
 5901        log.debug(f"force_update_annotation={force_update_annotation}")
 5902        force_append_annotation = (
 5903            self.get_param()
 5904            .get("annotation", {})
 5905            .get("options", {})
 5906            .get("annotations_append", False)
 5907        )
 5908        log.debug(f"force_append_annotation={force_append_annotation}")
 5909
 5910        # Data
 5911        table_variants = self.get_table_variants()
 5912
 5913        # Check if not empty
 5914        log.debug("Check if not empty")
 5915        sql_query_chromosomes_df = self.get_query_to_df(
 5916            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5917        )
 5918        if not sql_query_chromosomes_df["count"][0]:
 5919            log.info(f"VCF empty")
 5920            return
 5921
 5922        # VCF header
 5923        vcf_reader = self.get_header()
 5924        log.debug("Initial header: " + str(vcf_reader.infos))
 5925
 5926        # Nb Variants POS
 5927        log.debug("NB Variants Start")
 5928        nb_variants = self.conn.execute(
 5929            f"SELECT count(*) AS count FROM variants"
 5930        ).fetchdf()["count"][0]
 5931        log.debug("NB Variants Stop")
 5932
 5933        # Existing annotations
 5934        for vcf_annotation in self.get_header().infos:
 5935
 5936            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5937            log.debug(
 5938                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5939            )
 5940
 5941        # Added columns
 5942        added_columns = []
 5943
 5944        # drop indexes
 5945        log.debug(f"Drop indexes...")
 5946        self.drop_indexes()
 5947
 5948        if annotations:
 5949
 5950            if "ALL" in annotations:
 5951
 5952                all_param = annotations.get("ALL", {})
 5953                all_param_formats = all_param.get("formats", None)
 5954                all_param_releases = all_param.get("releases", None)
 5955
 5956                databases_infos_dict = self.scan_databases(
 5957                    database_formats=all_param_formats,
 5958                    database_releases=all_param_releases,
 5959                )
 5960                for database_infos in databases_infos_dict.keys():
 5961                    if database_infos not in annotations:
 5962                        annotations[database_infos] = {"INFO": None}
 5963
 5964            for annotation in annotations:
 5965
 5966                if annotation in ["ALL"]:
 5967                    continue
 5968
 5969                # Annotation Name
 5970                annotation_name = os.path.basename(annotation)
 5971
 5972                # Annotation fields
 5973                annotation_fields = annotations[annotation]
 5974                if not annotation_fields:
 5975                    annotation_fields = {"INFO": None}
 5976
 5977                log.debug(f"Annotation '{annotation_name}'")
 5978                log.debug(
 5979                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5980                )
 5981
 5982                # Create Database
 5983                database = Database(
 5984                    database=annotation,
 5985                    databases_folders=databases_folders,
 5986                    assembly=assembly,
 5987                )
 5988
 5989                # Find files
 5990                parquet_file = database.get_database()
 5991                parquet_hdr_file = database.get_header_file()
 5992                parquet_type = database.get_type()
 5993
 5994                # Check if files exists
 5995                if not parquet_file or not parquet_hdr_file:
 5996                    msg_err_list = []
 5997                    if not parquet_file:
 5998                        msg_err_list.append(
 5999                            f"Annotation failed: Annotation file not found"
 6000                        )
 6001                    if parquet_file and not parquet_hdr_file:
 6002                        msg_err_list.append(
 6003                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6004                        )
 6005
 6006                    log.error(". ".join(msg_err_list))
 6007                    raise ValueError(". ".join(msg_err_list))
 6008                else:
 6009                    # Get parquet connexion
 6010                    parquet_sql_attach = database.get_sql_database_attach(
 6011                        output="query"
 6012                    )
 6013                    if parquet_sql_attach:
 6014                        self.conn.execute(parquet_sql_attach)
 6015                    parquet_file_link = database.get_sql_database_link()
 6016                    # Log
 6017                    log.debug(
 6018                        f"Annotation '{annotation_name}' - file: "
 6019                        + str(parquet_file)
 6020                        + " and "
 6021                        + str(parquet_hdr_file)
 6022                    )
 6023
 6024                    # Database full header columns
 6025                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6026                        parquet_hdr_file
 6027                    )
 6028                    # Log
 6029                    log.debug(
 6030                        "Annotation database header columns : "
 6031                        + str(parquet_hdr_vcf_header_columns)
 6032                    )
 6033
 6034                    # Load header as VCF object
 6035                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6036                    # Log
 6037                    log.debug(
 6038                        "Annotation database header: "
 6039                        + str(parquet_hdr_vcf_header_infos)
 6040                    )
 6041
 6042                    # Get extra infos
 6043                    parquet_columns = database.get_extra_columns()
 6044                    # Log
 6045                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6046
 6047                    # Add extra columns if "ALL" in annotation_fields
 6048                    # if "ALL" in annotation_fields:
 6049                    #     allow_add_extra_column = True
 6050                    if "ALL" in annotation_fields and database.get_extra_columns():
 6051                        for extra_column in database.get_extra_columns():
 6052                            if (
 6053                                extra_column not in annotation_fields
 6054                                and extra_column.replace("INFO/", "")
 6055                                not in parquet_hdr_vcf_header_infos
 6056                            ):
 6057                                parquet_hdr_vcf_header_infos[extra_column] = (
 6058                                    vcf.parser._Info(
 6059                                        extra_column,
 6060                                        ".",
 6061                                        "String",
 6062                                        f"{extra_column} description",
 6063                                        "unknown",
 6064                                        "unknown",
 6065                                        self.code_type_map["String"],
 6066                                    )
 6067                                )
 6068
 6069                    # For all fields in database
 6070                    annotation_fields_all = False
 6071                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6072                        annotation_fields_all = True
 6073                        annotation_fields = {
 6074                            key: key for key in parquet_hdr_vcf_header_infos
 6075                        }
 6076
 6077                        log.debug(
 6078                            "Annotation database header - All annotations added: "
 6079                            + str(annotation_fields)
 6080                        )
 6081
 6082                    # Init
 6083
 6084                    # List of annotation fields to use
 6085                    sql_query_annotation_update_info_sets = []
 6086
 6087                    # List of annotation to agregate
 6088                    sql_query_annotation_to_agregate = []
 6089
 6090                    # Number of fields
 6091                    nb_annotation_field = 0
 6092
 6093                    # Annotation fields processed
 6094                    annotation_fields_processed = []
 6095
 6096                    # Columns mapping
 6097                    map_columns = database.map_columns(
 6098                        columns=annotation_fields, prefixes=["INFO/"]
 6099                    )
 6100
 6101                    # Query dict for fields to remove (update option)
 6102                    query_dict_remove = {}
 6103
 6104                    # Fetch Anotation fields
 6105                    for annotation_field in annotation_fields:
 6106
 6107                        # annotation_field_column
 6108                        annotation_field_column = map_columns.get(
 6109                            annotation_field, "INFO"
 6110                        )
 6111
 6112                        # field new name, if parametered
 6113                        annotation_fields_new_name = annotation_fields.get(
 6114                            annotation_field, annotation_field
 6115                        )
 6116                        if not annotation_fields_new_name:
 6117                            annotation_fields_new_name = annotation_field
 6118
 6119                        # To annotate
 6120                        # force_update_annotation = True
 6121                        # force_append_annotation = True
 6122                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6123                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6124                            force_update_annotation
 6125                            or force_append_annotation
 6126                            or (
 6127                                annotation_fields_new_name
 6128                                not in self.get_header().infos
 6129                            )
 6130                        ):
 6131
 6132                            # Add field to annotation to process list
 6133                            annotation_fields_processed.append(
 6134                                annotation_fields_new_name
 6135                            )
 6136
 6137                            # explode infos for the field
 6138                            annotation_fields_new_name_info_msg = ""
 6139                            if (
 6140                                force_update_annotation
 6141                                and annotation_fields_new_name
 6142                                in self.get_header().infos
 6143                            ):
 6144                                # Remove field from INFO
 6145                                query = f"""
 6146                                    UPDATE {table_variants} as table_variants
 6147                                    SET INFO = REGEXP_REPLACE(
 6148                                                concat(table_variants.INFO,''),
 6149                                                ';*{annotation_fields_new_name}=[^;]*',
 6150                                                ''
 6151                                                )
 6152                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6153                                """
 6154                                annotation_fields_new_name_info_msg = " [update]"
 6155                                query_dict_remove[
 6156                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6157                                ] = query
 6158
 6159                            # Sep between fields in INFO
 6160                            nb_annotation_field += 1
 6161                            if nb_annotation_field > 1:
 6162                                annotation_field_sep = ";"
 6163                            else:
 6164                                annotation_field_sep = ""
 6165
 6166                            log.info(
 6167                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6168                            )
 6169
 6170                            # Add INFO field to header
 6171                            parquet_hdr_vcf_header_infos_number = (
 6172                                parquet_hdr_vcf_header_infos[annotation_field].num
 6173                                or "."
 6174                            )
 6175                            parquet_hdr_vcf_header_infos_type = (
 6176                                parquet_hdr_vcf_header_infos[annotation_field].type
 6177                                or "String"
 6178                            )
 6179                            parquet_hdr_vcf_header_infos_description = (
 6180                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6181                                or f"{annotation_field} description"
 6182                            )
 6183                            parquet_hdr_vcf_header_infos_source = (
 6184                                parquet_hdr_vcf_header_infos[annotation_field].source
 6185                                or "unknown"
 6186                            )
 6187                            parquet_hdr_vcf_header_infos_version = (
 6188                                parquet_hdr_vcf_header_infos[annotation_field].version
 6189                                or "unknown"
 6190                            )
 6191
 6192                            vcf_reader.infos[annotation_fields_new_name] = (
 6193                                vcf.parser._Info(
 6194                                    annotation_fields_new_name,
 6195                                    parquet_hdr_vcf_header_infos_number,
 6196                                    parquet_hdr_vcf_header_infos_type,
 6197                                    parquet_hdr_vcf_header_infos_description,
 6198                                    parquet_hdr_vcf_header_infos_source,
 6199                                    parquet_hdr_vcf_header_infos_version,
 6200                                    self.code_type_map[
 6201                                        parquet_hdr_vcf_header_infos_type
 6202                                    ],
 6203                                )
 6204                            )
 6205
 6206                            # Append
 6207                            if force_append_annotation:
 6208                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6209                            else:
 6210                                query_case_when_append = ""
 6211
 6212                            # Annotation/Update query fields
 6213                            # Found in INFO column
 6214                            if (
 6215                                annotation_field_column == "INFO"
 6216                                and "INFO" in parquet_hdr_vcf_header_columns
 6217                            ):
 6218                                sql_query_annotation_update_info_sets.append(
 6219                                    f"""
 6220                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6221                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6222                                        ELSE ''
 6223                                    END
 6224                                """
 6225                                )
 6226                            # Found in a specific column
 6227                            else:
 6228                                sql_query_annotation_update_info_sets.append(
 6229                                    f"""
 6230                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6231                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6232                                        ELSE ''
 6233                                    END
 6234                                """
 6235                                )
 6236                                sql_query_annotation_to_agregate.append(
 6237                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6238                                )
 6239
 6240                        # Not to annotate
 6241                        else:
 6242
 6243                            if force_update_annotation:
 6244                                annotation_message = "forced"
 6245                            else:
 6246                                annotation_message = "skipped"
 6247
 6248                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6249                                log.warning(
 6250                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6251                                )
 6252                            if annotation_fields_new_name in self.get_header().infos:
 6253                                log.warning(
 6254                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6255                                )
 6256
 6257                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6258                    # allow_annotation_full_info = True
 6259                    allow_annotation_full_info = not force_append_annotation
 6260
 6261                    if parquet_type in ["regions"]:
 6262                        allow_annotation_full_info = False
 6263
 6264                    if (
 6265                        allow_annotation_full_info
 6266                        and nb_annotation_field == len(annotation_fields)
 6267                        and annotation_fields_all
 6268                        and (
 6269                            "INFO" in parquet_hdr_vcf_header_columns
 6270                            and "INFO" in database.get_extra_columns()
 6271                        )
 6272                    ):
 6273                        log.debug("Column INFO annotation enabled")
 6274                        sql_query_annotation_update_info_sets = []
 6275                        sql_query_annotation_update_info_sets.append(
 6276                            f" table_parquet.INFO "
 6277                        )
 6278
 6279                    if sql_query_annotation_update_info_sets:
 6280
 6281                        # Annotate
 6282                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6283
 6284                        # Join query annotation update info sets for SQL
 6285                        sql_query_annotation_update_info_sets_sql = ",".join(
 6286                            sql_query_annotation_update_info_sets
 6287                        )
 6288
 6289                        # Check chromosomes list (and variants infos)
 6290                        sql_query_chromosomes = f"""
 6291                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6292                            FROM {table_variants} as table_variants
 6293                            GROUP BY table_variants."#CHROM"
 6294                            ORDER BY table_variants."#CHROM"
 6295                            """
 6296                        sql_query_chromosomes_df = self.conn.execute(
 6297                            sql_query_chromosomes
 6298                        ).df()
 6299                        sql_query_chromosomes_dict = {
 6300                            entry["CHROM"]: {
 6301                                "count": entry["count_variants"],
 6302                                "min": entry["min_variants"],
 6303                                "max": entry["max_variants"],
 6304                            }
 6305                            for index, entry in sql_query_chromosomes_df.iterrows()
 6306                        }
 6307
 6308                        # Init
 6309                        nb_of_query = 0
 6310                        nb_of_variant_annotated = 0
 6311                        query_dict = query_dict_remove
 6312
 6313                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6314                        for chrom in sql_query_chromosomes_dict:
 6315
 6316                            # Number of variant by chromosome
 6317                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6318                                chrom, {}
 6319                            ).get("count", 0)
 6320
 6321                            log.debug(
 6322                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6323                            )
 6324
 6325                            # Annotation with regions database
 6326                            if parquet_type in ["regions"]:
 6327                                sql_query_annotation_from_clause = f"""
 6328                                    FROM (
 6329                                        SELECT 
 6330                                            '{chrom}' AS \"#CHROM\",
 6331                                            table_variants_from.\"POS\" AS \"POS\",
 6332                                            {",".join(sql_query_annotation_to_agregate)}
 6333                                        FROM {table_variants} as table_variants_from
 6334                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6335                                            table_parquet_from."#CHROM" = '{chrom}'
 6336                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6337                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6338                                        )
 6339                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6340                                        GROUP BY table_variants_from.\"POS\"
 6341                                        )
 6342                                        as table_parquet
 6343                                """
 6344
 6345                                sql_query_annotation_where_clause = """
 6346                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6347                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6348                                """
 6349
 6350                            # Annotation with variants database
 6351                            else:
 6352                                sql_query_annotation_from_clause = f"""
 6353                                    FROM {parquet_file_link} as table_parquet
 6354                                """
 6355                                sql_query_annotation_where_clause = f"""
 6356                                    table_variants."#CHROM" = '{chrom}'
 6357                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6358                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6359                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6360                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6361                                """
 6362
 6363                            # Create update query
 6364                            sql_query_annotation_chrom_interval_pos = f"""
 6365                                UPDATE {table_variants} as table_variants
 6366                                    SET INFO = 
 6367                                        concat(
 6368                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6369                                                THEN table_variants.INFO
 6370                                                ELSE ''
 6371                                            END
 6372                                            ,
 6373                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6374                                                        AND (
 6375                                                        concat({sql_query_annotation_update_info_sets_sql})
 6376                                                        )
 6377                                                        NOT IN ('','.') 
 6378                                                    THEN ';'
 6379                                                    ELSE ''
 6380                                            END
 6381                                            ,
 6382                                            {sql_query_annotation_update_info_sets_sql}
 6383                                            )
 6384                                    {sql_query_annotation_from_clause}
 6385                                    WHERE {sql_query_annotation_where_clause}
 6386                                    ;
 6387                                """
 6388
 6389                            # Add update query to dict
 6390                            query_dict[
 6391                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6392                            ] = sql_query_annotation_chrom_interval_pos
 6393
 6394                        nb_of_query = len(query_dict)
 6395                        num_query = 0
 6396
 6397                        # SET max_expression_depth TO x
 6398                        self.conn.execute("SET max_expression_depth TO 10000")
 6399
 6400                        for query_name in query_dict:
 6401                            query = query_dict[query_name]
 6402                            num_query += 1
 6403                            log.info(
 6404                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6405                            )
 6406                            result = self.conn.execute(query)
 6407                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6408                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6409                            log.info(
 6410                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6411                            )
 6412
 6413                        log.info(
 6414                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6415                        )
 6416
 6417                    else:
 6418
 6419                        log.info(
 6420                            f"Annotation '{annotation_name}' - No Annotations available"
 6421                        )
 6422
 6423                    log.debug("Final header: " + str(vcf_reader.infos))
 6424
 6425        # Remove added columns
 6426        for added_column in added_columns:
 6427            self.drop_column(column=added_column)
 6428
 6429    def annotation_splice(self, threads: int = None) -> None:
 6430        """
 6431        This function annotate with snpEff
 6432
 6433        :param threads: The number of threads to use
 6434        :return: the value of the variable "return_value".
 6435        """
 6436
 6437        # DEBUG
 6438        log.debug("Start annotation with splice tools")
 6439
 6440        # Threads
 6441        if not threads:
 6442            threads = self.get_threads()
 6443        log.debug("Threads: " + str(threads))
 6444
 6445        # DEBUG
 6446        delete_tmp = True
 6447        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6448            delete_tmp = False
 6449            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6450
 6451        # Config
 6452        config = self.get_config()
 6453        log.debug("Config: " + str(config))
 6454        splice_config = config.get("tools", {}).get("splice", {})
 6455        if not splice_config:
 6456            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6457            msg_err = "No Splice tool config"
 6458            raise ValueError(msg_err)
 6459        log.debug(f"splice_config: {splice_config}")
 6460
 6461        # Config - Folders - Databases
 6462        databases_folders = (
 6463            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6464        )
 6465        log.debug("Databases annotations: " + str(databases_folders))
 6466
 6467        # Splice docker image
 6468        splice_docker_image = splice_config.get("docker").get("image")
 6469
 6470        # Pull splice image if it's not already there
 6471        if not check_docker_image_exists(splice_docker_image):
 6472            log.warning(
 6473                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6474            )
 6475            try:
 6476                command(f"docker pull {splice_config.get('docker').get('image')}")
 6477            except subprocess.CalledProcessError:
 6478                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6479                log.error(msg_err)
 6480                raise ValueError(msg_err)
 6481
 6482        # Config - splice databases
 6483        splice_databases = (
 6484            config.get("folders", {})
 6485            .get("databases", {})
 6486            .get("splice", DEFAULT_SPLICE_FOLDER)
 6487        )
 6488        splice_databases = full_path(splice_databases)
 6489
 6490        # Param
 6491        param = self.get_param()
 6492        log.debug("Param: " + str(param))
 6493
 6494        # Param
 6495        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6496        log.debug("Options: " + str(options))
 6497
 6498        # Data
 6499        table_variants = self.get_table_variants()
 6500
 6501        # Check if not empty
 6502        log.debug("Check if not empty")
 6503        sql_query_chromosomes = (
 6504            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6505        )
 6506        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6507            log.info("VCF empty")
 6508            return None
 6509
 6510        # Export in VCF
 6511        log.debug("Create initial file to annotate")
 6512
 6513        # Create output folder / work folder
 6514        if options.get("output_folder", ""):
 6515            output_folder = options.get("output_folder", "")
 6516            if not os.path.exists(output_folder):
 6517                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6518        else:
 6519            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6520            if not os.path.exists(output_folder):
 6521                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6522
 6523        if options.get("workdir", ""):
 6524            workdir = options.get("workdir", "")
 6525        else:
 6526            workdir = "/work"
 6527
 6528        # Create tmp VCF file
 6529        tmp_vcf = NamedTemporaryFile(
 6530            prefix=self.get_prefix(),
 6531            dir=output_folder,
 6532            suffix=".vcf",
 6533            delete=False,
 6534        )
 6535        tmp_vcf_name = tmp_vcf.name
 6536
 6537        # VCF header
 6538        header = self.get_header()
 6539
 6540        # Existing annotations
 6541        for vcf_annotation in self.get_header().infos:
 6542
 6543            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6544            log.debug(
 6545                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6546            )
 6547
 6548        # Memory limit
 6549        if config.get("memory", None):
 6550            memory_limit = config.get("memory", "8G").upper()
 6551            # upper()
 6552        else:
 6553            memory_limit = "8G"
 6554        log.debug(f"memory_limit: {memory_limit}")
 6555
 6556        # Check number of variants to annotate
 6557        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6558        where_clause_regex_spip = r"SPiP_\w+"
 6559        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6560        df_list_of_variants_to_annotate = self.get_query_to_df(
 6561            query=f""" SELECT * FROM variants {where_clause} """
 6562        )
 6563        if len(df_list_of_variants_to_annotate) == 0:
 6564            log.warning(
 6565                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6566            )
 6567            return None
 6568        else:
 6569            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6570
 6571        # Export VCF file
 6572        self.export_variant_vcf(
 6573            vcf_file=tmp_vcf_name,
 6574            remove_info=True,
 6575            add_samples=True,
 6576            index=False,
 6577            where_clause=where_clause,
 6578        )
 6579        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6580        if any(value for value in splice_config.values() if value is None):
 6581            log.warning("At least one splice config parameter is empty")
 6582            # exit annotation_splice
 6583            return None
 6584
 6585        # Params in splice nf
 6586        def check_values(dico: dict):
 6587            """
 6588            Ensure parameters for NF splice pipeline
 6589            """
 6590            for key, val in dico.items():
 6591                if key == "genome":
 6592                    if any(
 6593                        assemb in options.get("genome", {})
 6594                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6595                    ):
 6596                        yield f"--{key} hg19"
 6597                    elif any(
 6598                        assemb in options.get("genome", {})
 6599                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6600                    ):
 6601                        yield f"--{key} hg38"
 6602                elif (
 6603                    (isinstance(val, str) and val)
 6604                    or isinstance(val, int)
 6605                    or isinstance(val, bool)
 6606                ):
 6607                    yield f"--{key} {val}"
 6608
 6609        # Genome
 6610        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6611        options["genome"] = genome
 6612        # NF params
 6613        nf_params = []
 6614        # Add options
 6615        if options:
 6616            log.debug(options)
 6617            nf_params = list(check_values(options))
 6618            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6619        else:
 6620            log.debug("No NF params provided")
 6621        # Add threads
 6622        if "threads" not in options.keys():
 6623            nf_params.append(f"--threads {threads}")
 6624        # Genome path
 6625        genome_path = find_genome(
 6626            config.get("folders", {})
 6627            .get("databases", {})
 6628            .get("genomes", DEFAULT_GENOME_FOLDER),
 6629            file=f"{genome}.fa",
 6630        )
 6631        # Add genome path
 6632        if not genome_path:
 6633            raise ValueError(
 6634                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6635            )
 6636        else:
 6637            log.debug(f"Genome: {genome_path}")
 6638            nf_params.append(f"--genome_path {genome_path}")
 6639
 6640        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6641            """
 6642            Setting up updated databases for SPiP and SpliceAI
 6643            """
 6644
 6645            try:
 6646
 6647                # SpliceAI assembly transcriptome
 6648                spliceai_assembly = os.path.join(
 6649                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6650                    options.get("genome"),
 6651                    "transcriptome",
 6652                )
 6653                spip_assembly = options.get("genome")
 6654
 6655                spip = find(
 6656                    f"transcriptome_{spip_assembly}.RData",
 6657                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6658                )
 6659                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6660                log.debug(f"SPiP annotations: {spip}")
 6661                log.debug(f"SpliceAI annotations: {spliceai}")
 6662                if spip and spliceai:
 6663                    return [
 6664                        f"--spip_transcriptome {spip}",
 6665                        f"--spliceai_transcriptome {spliceai}",
 6666                    ]
 6667                else:
 6668                    log.warning(
 6669                        "Can't find splice databases in configuration, use annotations file from image"
 6670                    )
 6671            except TypeError:
 6672                log.warning(
 6673                    "Can't find splice databases in configuration, use annotations file from image"
 6674                )
 6675                return []
 6676
 6677        # Add options, check if transcriptome option have already beend provided
 6678        if (
 6679            "spip_transcriptome" not in nf_params
 6680            and "spliceai_transcriptome" not in nf_params
 6681        ):
 6682            splice_reference = splice_annotations(options, config)
 6683            if splice_reference:
 6684                nf_params.extend(splice_reference)
 6685        # nf_params.append(f"--output_folder {output_folder}")
 6686        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6687        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6688        log.debug(cmd)
 6689        splice_config["docker"]["command"] = cmd
 6690
 6691        # Ensure proxy is set
 6692        proxy = [
 6693            f"-e {var}={os.getenv(var)}"
 6694            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6695            if os.getenv(var) is not None
 6696        ]
 6697        docker_cmd = get_bin_command(
 6698            tool="splice",
 6699            bin_type="docker",
 6700            config=config,
 6701            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6702            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6703        )
 6704        # print(docker_cmd)
 6705        # exit()
 6706        # Docker debug
 6707        # if splice_config.get("rm_container"):
 6708        #     rm_container = "--rm"
 6709        # else:
 6710        #     rm_container = ""
 6711        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6712        log.debug(docker_cmd)
 6713        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6714        log.debug(res.stdout)
 6715        if res.stderr:
 6716            log.error(res.stderr)
 6717        res.check_returncode()
 6718        # Update variants
 6719        log.info("Annotation - Updating...")
 6720        # Test find output vcf
 6721        log.debug(
 6722            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6723        )
 6724        output_vcf = []
 6725        # Wrong folder to look in
 6726        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6727            if (
 6728                files
 6729                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6730            ):
 6731                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6732        # log.debug(os.listdir(options.get("output_folder")))
 6733        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6734        if not output_vcf:
 6735            log.debug(
 6736                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6737            )
 6738        else:
 6739            # Get new header from annotated vcf
 6740            log.debug(f"Initial header: {len(header.infos)} fields")
 6741            # Create new header with splice infos
 6742            new_vcf = Variants(input=output_vcf[0])
 6743            new_vcf_header = new_vcf.get_header().infos
 6744            for keys, infos in new_vcf_header.items():
 6745                if keys not in header.infos.keys():
 6746                    header.infos[keys] = infos
 6747            log.debug(f"New header: {len(header.infos)} fields")
 6748            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6749            self.update_from_vcf(output_vcf[0])
 6750
 6751        # Remove file
 6752        remove_if_exists(output_vcf)
 6753
 6754    ###
 6755    # Prioritization
 6756    ###
 6757
 6758    def get_config_default(self, name: str) -> dict:
 6759        """
 6760        The function `get_config_default` returns a dictionary containing default configurations for
 6761        various calculations and prioritizations.
 6762
 6763        :param name: The `get_config_default` function returns a dictionary containing default
 6764        configurations for different calculations and prioritizations. The `name` parameter is used to
 6765        specify which specific configuration to retrieve from the dictionary
 6766        :type name: str
 6767        :return: The function `get_config_default` returns a dictionary containing default configuration
 6768        settings for different calculations and prioritizations. The specific configuration settings are
 6769        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6770        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6771        returned. If there is no match, an empty dictionary is returned.
 6772        """
 6773
 6774        config_default = {
 6775            "calculations": {
 6776                "variant_chr_pos_alt_ref": {
 6777                    "type": "sql",
 6778                    "name": "variant_chr_pos_alt_ref",
 6779                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6780                    "available": False,
 6781                    "output_column_name": "variant_chr_pos_alt_ref",
 6782                    "output_column_type": "String",
 6783                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6784                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6785                    "operation_info": True,
 6786                },
 6787                "VARTYPE": {
 6788                    "type": "sql",
 6789                    "name": "VARTYPE",
 6790                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6791                    "available": True,
 6792                    "table": "variants",
 6793                    "output_column_name": "VARTYPE",
 6794                    "output_column_type": "String",
 6795                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6796                    "operation_query": """
 6797                            CASE
 6798                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6799                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6800                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6801                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6802                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6803                                ELSE 'UNDEFINED'
 6804                            END
 6805                            """,
 6806                    "info_fields": ["SVTYPE"],
 6807                    "operation_info": True,
 6808                },
 6809                "snpeff_hgvs": {
 6810                    "type": "python",
 6811                    "name": "snpeff_hgvs",
 6812                    "description": "HGVS nomenclatures from snpEff annotation",
 6813                    "available": True,
 6814                    "function_name": "calculation_extract_snpeff_hgvs",
 6815                    "function_params": ["snpeff_hgvs", "ANN"],
 6816                },
 6817                "snpeff_ann_explode": {
 6818                    "type": "python",
 6819                    "name": "snpeff_ann_explode",
 6820                    "description": "Explode snpEff annotations with uniquify values",
 6821                    "available": True,
 6822                    "function_name": "calculation_snpeff_ann_explode",
 6823                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6824                },
 6825                "snpeff_ann_explode_uniquify": {
 6826                    "type": "python",
 6827                    "name": "snpeff_ann_explode_uniquify",
 6828                    "description": "Explode snpEff annotations",
 6829                    "available": True,
 6830                    "function_name": "calculation_snpeff_ann_explode",
 6831                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6832                },
 6833                "snpeff_ann_explode_json": {
 6834                    "type": "python",
 6835                    "name": "snpeff_ann_explode_json",
 6836                    "description": "Explode snpEff annotations in JSON format",
 6837                    "available": True,
 6838                    "function_name": "calculation_snpeff_ann_explode",
 6839                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6840                },
 6841                "NOMEN": {
 6842                    "type": "python",
 6843                    "name": "NOMEN",
 6844                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6845                    "available": True,
 6846                    "function_name": "calculation_extract_nomen",
 6847                    "function_params": [],
 6848                },
 6849                "RENAME_INFO_FIELDS": {
 6850                    "type": "python",
 6851                    "name": "RENAME_INFO_FIELDS",
 6852                    "description": "Rename or remove INFO/tags",
 6853                    "available": True,
 6854                    "function_name": "calculation_rename_info_fields",
 6855                    "function_params": [],
 6856                },
 6857                "FINDBYPIPELINE": {
 6858                    "type": "python",
 6859                    "name": "FINDBYPIPELINE",
 6860                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6861                    "available": True,
 6862                    "function_name": "calculation_find_by_pipeline",
 6863                    "function_params": ["findbypipeline"],
 6864                },
 6865                "FINDBYSAMPLE": {
 6866                    "type": "python",
 6867                    "name": "FINDBYSAMPLE",
 6868                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6869                    "available": True,
 6870                    "function_name": "calculation_find_by_pipeline",
 6871                    "function_params": ["findbysample"],
 6872                },
 6873                "GENOTYPECONCORDANCE": {
 6874                    "type": "python",
 6875                    "name": "GENOTYPECONCORDANCE",
 6876                    "description": "Concordance of genotype for multi caller VCF",
 6877                    "available": True,
 6878                    "function_name": "calculation_genotype_concordance",
 6879                    "function_params": [],
 6880                },
 6881                "BARCODE": {
 6882                    "type": "python",
 6883                    "name": "BARCODE",
 6884                    "description": "BARCODE as VaRank tool",
 6885                    "available": True,
 6886                    "function_name": "calculation_barcode",
 6887                    "function_params": [],
 6888                },
 6889                "BARCODEFAMILY": {
 6890                    "type": "python",
 6891                    "name": "BARCODEFAMILY",
 6892                    "description": "BARCODEFAMILY as VaRank tool",
 6893                    "available": True,
 6894                    "function_name": "calculation_barcode_family",
 6895                    "function_params": ["BCF"],
 6896                },
 6897                "TRIO": {
 6898                    "type": "python",
 6899                    "name": "TRIO",
 6900                    "description": "Inheritance for a trio family",
 6901                    "available": True,
 6902                    "function_name": "calculation_trio",
 6903                    "function_params": [],
 6904                },
 6905                "VAF": {
 6906                    "type": "python",
 6907                    "name": "VAF",
 6908                    "description": "Variant Allele Frequency (VAF) harmonization",
 6909                    "available": True,
 6910                    "function_name": "calculation_vaf_normalization",
 6911                    "function_params": [],
 6912                },
 6913                "VAF_stats": {
 6914                    "type": "python",
 6915                    "name": "VAF_stats",
 6916                    "description": "Variant Allele Frequency (VAF) statistics",
 6917                    "available": True,
 6918                    "function_name": "calculation_genotype_stats",
 6919                    "function_params": ["VAF"],
 6920                },
 6921                "DP_stats": {
 6922                    "type": "python",
 6923                    "name": "DP_stats",
 6924                    "description": "Depth (DP) statistics",
 6925                    "available": True,
 6926                    "function_name": "calculation_genotype_stats",
 6927                    "function_params": ["DP"],
 6928                },
 6929                "variant_id": {
 6930                    "type": "python",
 6931                    "name": "variant_id",
 6932                    "description": "Variant ID generated from variant position and type",
 6933                    "available": True,
 6934                    "function_name": "calculation_variant_id",
 6935                    "function_params": [],
 6936                },
 6937                "transcripts_json": {
 6938                    "type": "python",
 6939                    "name": "transcripts_json",
 6940                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6941                    "available": True,
 6942                    "function_name": "calculation_transcripts_annotation",
 6943                    "function_params": ["transcripts_json", None],
 6944                },
 6945                "transcripts_ann": {
 6946                    "type": "python",
 6947                    "name": "transcripts_ann",
 6948                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6949                    "available": True,
 6950                    "function_name": "calculation_transcripts_annotation",
 6951                    "function_params": [None, "transcripts_ann"],
 6952                },
 6953                "transcripts_annotations": {
 6954                    "type": "python",
 6955                    "name": "transcripts_annotations",
 6956                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6957                    "available": True,
 6958                    "function_name": "calculation_transcripts_annotation",
 6959                    "function_params": [None, None],
 6960                },
 6961                "transcripts_prioritization": {
 6962                    "type": "python",
 6963                    "name": "transcripts_prioritization",
 6964                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6965                    "available": True,
 6966                    "function_name": "calculation_transcripts_prioritization",
 6967                    "function_params": [],
 6968                },
 6969                "transcripts_export": {
 6970                    "type": "python",
 6971                    "name": "transcripts_export",
 6972                    "description": "Export transcripts table/view as a file (using param.json)",
 6973                    "available": True,
 6974                    "function_name": "calculation_transcripts_export",
 6975                    "function_params": [],
 6976                },
 6977            },
 6978            "prioritizations": {
 6979                "default": {
 6980                    "ANN2": [
 6981                        {
 6982                            "type": "contains",
 6983                            "value": "HIGH",
 6984                            "score": 5,
 6985                            "flag": "PASS",
 6986                            "comment": [
 6987                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6988                            ],
 6989                        },
 6990                        {
 6991                            "type": "contains",
 6992                            "value": "MODERATE",
 6993                            "score": 3,
 6994                            "flag": "PASS",
 6995                            "comment": [
 6996                                "A non-disruptive variant that might change protein effectiveness"
 6997                            ],
 6998                        },
 6999                        {
 7000                            "type": "contains",
 7001                            "value": "LOW",
 7002                            "score": 0,
 7003                            "flag": "FILTERED",
 7004                            "comment": [
 7005                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7006                            ],
 7007                        },
 7008                        {
 7009                            "type": "contains",
 7010                            "value": "MODIFIER",
 7011                            "score": 0,
 7012                            "flag": "FILTERED",
 7013                            "comment": [
 7014                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7015                            ],
 7016                        },
 7017                    ],
 7018                }
 7019            },
 7020        }
 7021
 7022        return config_default.get(name, None)
 7023
 7024    def get_config_json(
 7025        self, name: str, config_dict: dict = {}, config_file: str = None
 7026    ) -> dict:
 7027        """
 7028        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7029        default values, a dictionary, and a file.
 7030
 7031        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7032        the name of the configuration. It is used to identify and retrieve the configuration settings
 7033        for a specific component or module
 7034        :type name: str
 7035        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7036        dictionary that allows you to provide additional configuration settings or overrides. When you
 7037        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7038        the key is the configuration setting you want to override or
 7039        :type config_dict: dict
 7040        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7041        specify the path to a configuration file that contains additional settings. If provided, the
 7042        function will read the contents of this file and update the configuration dictionary with the
 7043        values found in the file, overriding any existing values with the
 7044        :type config_file: str
 7045        :return: The function `get_config_json` returns a dictionary containing the configuration
 7046        settings.
 7047        """
 7048
 7049        # Create with default prioritizations
 7050        config_default = self.get_config_default(name=name)
 7051        configuration = config_default
 7052        # log.debug(f"configuration={configuration}")
 7053
 7054        # Replace prioritizations from dict
 7055        for config in config_dict:
 7056            configuration[config] = config_dict[config]
 7057
 7058        # Replace prioritizations from file
 7059        config_file = full_path(config_file)
 7060        if config_file:
 7061            if os.path.exists(config_file):
 7062                with open(config_file) as config_file_content:
 7063                    config_file_dict = yaml.safe_load(config_file_content)
 7064                for config in config_file_dict:
 7065                    configuration[config] = config_file_dict[config]
 7066            else:
 7067                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7068                log.error(msg_error)
 7069                raise ValueError(msg_error)
 7070
 7071        return configuration
 7072
 7073    def prioritization(
 7074        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7075    ) -> bool:
 7076        """
 7077        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7078        prioritizes variants based on configured profiles and criteria.
 7079
 7080        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7081        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7082        a table name is provided, the method will prioritize the variants in that specific table
 7083        :type table: str
 7084        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7085        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7086        provided, the code will use a default prefix value of "PZ"
 7087        :type pz_prefix: str
 7088        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7089        additional parameters specific to the prioritization process. These parameters can include
 7090        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7091        configurations needed for the prioritization of variants in a V
 7092        :type pz_param: dict
 7093        :return: A boolean value (True) is being returned from the `prioritization` function.
 7094        """
 7095
 7096        # Config
 7097        config = self.get_config()
 7098
 7099        # Param
 7100        param = self.get_param()
 7101
 7102        # Prioritization param
 7103        if pz_param is not None:
 7104            prioritization_param = pz_param
 7105        else:
 7106            prioritization_param = param.get("prioritization", {})
 7107
 7108        # Configuration profiles
 7109        prioritization_config_file = prioritization_param.get(
 7110            "prioritization_config", None
 7111        )
 7112        prioritization_config_file = full_path(prioritization_config_file)
 7113        prioritizations_config = self.get_config_json(
 7114            name="prioritizations", config_file=prioritization_config_file
 7115        )
 7116
 7117        # Prioritization prefix
 7118        pz_prefix_default = "PZ"
 7119        if pz_prefix is None:
 7120            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7121
 7122        # Prioritization options
 7123        profiles = prioritization_param.get("profiles", [])
 7124        if isinstance(profiles, str):
 7125            profiles = profiles.split(",")
 7126        pzfields = prioritization_param.get(
 7127            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7128        )
 7129        if isinstance(pzfields, str):
 7130            pzfields = pzfields.split(",")
 7131        default_profile = prioritization_param.get("default_profile", None)
 7132        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7133        prioritization_score_mode = prioritization_param.get(
 7134            "prioritization_score_mode", "HOWARD"
 7135        )
 7136
 7137        # Quick Prioritizations
 7138        prioritizations = param.get("prioritizations", None)
 7139        if prioritizations:
 7140            log.info("Quick Prioritization:")
 7141            for profile in prioritizations.split(","):
 7142                if profile not in profiles:
 7143                    profiles.append(profile)
 7144                    log.info(f"   {profile}")
 7145
 7146        # If profile "ALL" provided, all profiles in the config profiles
 7147        if "ALL" in profiles:
 7148            profiles = list(prioritizations_config.keys())
 7149
 7150        for profile in profiles:
 7151            if prioritizations_config.get(profile, None):
 7152                log.debug(f"Profile '{profile}' configured")
 7153            else:
 7154                msg_error = f"Profile '{profile}' NOT configured"
 7155                log.error(msg_error)
 7156                raise ValueError(msg_error)
 7157
 7158        if profiles:
 7159            log.info(f"Prioritization... ")
 7160        else:
 7161            log.debug(f"No profile defined")
 7162            return False
 7163
 7164        if not default_profile and len(profiles):
 7165            default_profile = profiles[0]
 7166
 7167        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7168        log.debug("Profiles to check: " + str(list(profiles)))
 7169
 7170        # Variables
 7171        if table is not None:
 7172            table_variants = table
 7173        else:
 7174            table_variants = self.get_table_variants(clause="update")
 7175        log.debug(f"Table to prioritize: {table_variants}")
 7176
 7177        # Added columns
 7178        added_columns = []
 7179
 7180        # Create list of PZfields
 7181        # List of PZFields
 7182        list_of_pzfields_original = pzfields + [
 7183            pzfield + pzfields_sep + profile
 7184            for pzfield in pzfields
 7185            for profile in profiles
 7186        ]
 7187        list_of_pzfields = []
 7188        log.debug(f"{list_of_pzfields_original}")
 7189
 7190        # Remove existing PZfields to use if exists
 7191        for pzfield in list_of_pzfields_original:
 7192            if self.get_header().infos.get(pzfield, None) is None:
 7193                list_of_pzfields.append(pzfield)
 7194                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7195            else:
 7196                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7197
 7198        if list_of_pzfields:
 7199
 7200            # Explode Infos prefix
 7201            explode_infos_prefix = self.get_explode_infos_prefix()
 7202
 7203            # PZfields tags description
 7204            PZfields_INFOS = {
 7205                f"{pz_prefix}Tags": {
 7206                    "ID": f"{pz_prefix}Tags",
 7207                    "Number": ".",
 7208                    "Type": "String",
 7209                    "Description": "Variant tags based on annotation criteria",
 7210                },
 7211                f"{pz_prefix}Score": {
 7212                    "ID": f"{pz_prefix}Score",
 7213                    "Number": 1,
 7214                    "Type": "Integer",
 7215                    "Description": "Variant score based on annotation criteria",
 7216                },
 7217                f"{pz_prefix}Flag": {
 7218                    "ID": f"{pz_prefix}Flag",
 7219                    "Number": 1,
 7220                    "Type": "String",
 7221                    "Description": "Variant flag based on annotation criteria",
 7222                },
 7223                f"{pz_prefix}Comment": {
 7224                    "ID": f"{pz_prefix}Comment",
 7225                    "Number": ".",
 7226                    "Type": "String",
 7227                    "Description": "Variant comment based on annotation criteria",
 7228                },
 7229                f"{pz_prefix}Infos": {
 7230                    "ID": f"{pz_prefix}Infos",
 7231                    "Number": ".",
 7232                    "Type": "String",
 7233                    "Description": "Variant infos based on annotation criteria",
 7234                },
 7235                f"{pz_prefix}Class": {
 7236                    "ID": f"{pz_prefix}Class",
 7237                    "Number": ".",
 7238                    "Type": "String",
 7239                    "Description": "Variant class based on annotation criteria",
 7240                },
 7241            }
 7242
 7243            # Create INFO fields if not exist
 7244            for field in PZfields_INFOS:
 7245                field_ID = PZfields_INFOS[field]["ID"]
 7246                field_description = PZfields_INFOS[field]["Description"]
 7247                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7248                    field_description = (
 7249                        PZfields_INFOS[field]["Description"]
 7250                        + f", profile {default_profile}"
 7251                    )
 7252                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7253                        field_ID,
 7254                        PZfields_INFOS[field]["Number"],
 7255                        PZfields_INFOS[field]["Type"],
 7256                        field_description,
 7257                        "unknown",
 7258                        "unknown",
 7259                        code_type_map[PZfields_INFOS[field]["Type"]],
 7260                    )
 7261
 7262            # Create INFO fields if not exist for each profile
 7263            for profile in prioritizations_config:
 7264                if profile in profiles or profiles == []:
 7265                    for field in PZfields_INFOS:
 7266                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7267                        field_description = (
 7268                            PZfields_INFOS[field]["Description"]
 7269                            + f", profile {profile}"
 7270                        )
 7271                        if (
 7272                            field_ID not in self.get_header().infos
 7273                            and field in pzfields
 7274                        ):
 7275                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7276                                field_ID,
 7277                                PZfields_INFOS[field]["Number"],
 7278                                PZfields_INFOS[field]["Type"],
 7279                                field_description,
 7280                                "unknown",
 7281                                "unknown",
 7282                                code_type_map[PZfields_INFOS[field]["Type"]],
 7283                            )
 7284
 7285            # Header
 7286            for pzfield in list_of_pzfields:
 7287                if re.match(f"{pz_prefix}Score.*", pzfield):
 7288                    added_column = self.add_column(
 7289                        table_name=table_variants,
 7290                        column_name=pzfield,
 7291                        column_type="INTEGER",
 7292                        default_value="0",
 7293                    )
 7294                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7295                    added_column = self.add_column(
 7296                        table_name=table_variants,
 7297                        column_name=pzfield,
 7298                        column_type="BOOLEAN",
 7299                        default_value="1",
 7300                    )
 7301                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7302                    added_column = self.add_column(
 7303                        table_name=table_variants,
 7304                        column_name=pzfield,
 7305                        column_type="VARCHAR[]",
 7306                        default_value="null",
 7307                    )
 7308                else:
 7309                    added_column = self.add_column(
 7310                        table_name=table_variants,
 7311                        column_name=pzfield,
 7312                        column_type="STRING",
 7313                        default_value="''",
 7314                    )
 7315                added_columns.append(added_column)
 7316
 7317            # Profiles
 7318            if profiles:
 7319
 7320                # foreach profile in configuration file
 7321                for profile in prioritizations_config:
 7322
 7323                    # If profile is asked in param, or ALL are asked (empty profile [])
 7324                    if profile in profiles or profiles == []:
 7325                        log.info(f"Profile '{profile}'")
 7326
 7327                        sql_set_info_option = ""
 7328
 7329                        sql_set_info = []
 7330
 7331                        # PZ fields set
 7332
 7333                        # PZScore
 7334                        if (
 7335                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7336                            in list_of_pzfields
 7337                        ):
 7338                            sql_set_info.append(
 7339                                f"""
 7340                                    concat(
 7341                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7342                                        {pz_prefix}Score{pzfields_sep}{profile}
 7343                                    ) 
 7344                                """
 7345                            )
 7346                            if (
 7347                                profile == default_profile
 7348                                and f"{pz_prefix}Score" in list_of_pzfields
 7349                            ):
 7350                                sql_set_info.append(
 7351                                    f"""
 7352                                        concat(
 7353                                            '{pz_prefix}Score=',
 7354                                            {pz_prefix}Score{pzfields_sep}{profile}
 7355                                        )
 7356                                    """
 7357                                )
 7358
 7359                        # PZFlag
 7360                        if (
 7361                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7362                            in list_of_pzfields
 7363                        ):
 7364                            sql_set_info.append(
 7365                                f"""
 7366                                    concat(
 7367                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7368                                        CASE 
 7369                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7370                                            THEN 'PASS'
 7371                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7372                                            THEN 'FILTERED'
 7373                                        END
 7374                                    ) 
 7375                                """
 7376                            )
 7377                            if (
 7378                                profile == default_profile
 7379                                and f"{pz_prefix}Flag" in list_of_pzfields
 7380                            ):
 7381                                sql_set_info.append(
 7382                                    f"""
 7383                                        concat(
 7384                                            '{pz_prefix}Flag=',
 7385                                            CASE 
 7386                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7387                                                THEN 'PASS'
 7388                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7389                                                THEN 'FILTERED'
 7390                                            END
 7391                                        )
 7392                                    """
 7393                                )
 7394
 7395                        # PZClass
 7396                        if (
 7397                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7398                            in list_of_pzfields
 7399                        ):
 7400                            sql_set_info.append(
 7401                                f"""
 7402                                    concat(
 7403                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7404                                        CASE
 7405                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7406                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7407                                            ELSE '.'
 7408                                        END 
 7409                                    )
 7410                                    
 7411                                """
 7412                            )
 7413                            if (
 7414                                profile == default_profile
 7415                                and f"{pz_prefix}Class" in list_of_pzfields
 7416                            ):
 7417                                sql_set_info.append(
 7418                                    f"""
 7419                                        concat(
 7420                                            '{pz_prefix}Class=',
 7421                                            CASE
 7422                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7423                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7424                                                ELSE '.'
 7425                                            END 
 7426                                        )
 7427                                    """
 7428                                )
 7429
 7430                        # PZComment
 7431                        if (
 7432                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7433                            in list_of_pzfields
 7434                        ):
 7435                            sql_set_info.append(
 7436                                f"""
 7437                                    CASE
 7438                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7439                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7440                                        ELSE ''
 7441                                    END
 7442                                """
 7443                            )
 7444                            if (
 7445                                profile == default_profile
 7446                                and f"{pz_prefix}Comment" in list_of_pzfields
 7447                            ):
 7448                                sql_set_info.append(
 7449                                    f"""
 7450                                        CASE
 7451                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7452                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7453                                            ELSE ''
 7454                                        END
 7455                                    """
 7456                                )
 7457
 7458                        # PZInfos
 7459                        if (
 7460                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7461                            in list_of_pzfields
 7462                        ):
 7463                            sql_set_info.append(
 7464                                f"""
 7465                                    CASE
 7466                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7467                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7468                                        ELSE ''
 7469                                    END
 7470                                """
 7471                            )
 7472                            if (
 7473                                profile == default_profile
 7474                                and f"{pz_prefix}Infos" in list_of_pzfields
 7475                            ):
 7476                                sql_set_info.append(
 7477                                    f"""
 7478                                        CASE
 7479                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7480                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7481                                            ELSE ''
 7482                                        END
 7483                                    """
 7484                                )
 7485
 7486                        # Merge PZfields
 7487                        sql_set_info_option = ""
 7488                        sql_set_sep = ""
 7489                        for sql_set in sql_set_info:
 7490                            if sql_set_sep:
 7491                                sql_set_info_option += f"""
 7492                                    , concat('{sql_set_sep}', {sql_set})
 7493                                """
 7494                            else:
 7495                                sql_set_info_option += f"""
 7496                                    , {sql_set}
 7497                                """
 7498                            sql_set_sep = ";"
 7499
 7500                        sql_queries = []
 7501                        for annotation in prioritizations_config[profile]:
 7502
 7503                            # skip special sections
 7504                            if annotation.startswith("_"):
 7505                                continue
 7506
 7507                            # For each criterions
 7508                            for criterion in prioritizations_config[profile][
 7509                                annotation
 7510                            ]:
 7511
 7512                                # Criterion mode
 7513                                criterion_mode = None
 7514                                if np.any(
 7515                                    np.isin(list(criterion.keys()), ["type", "value"])
 7516                                ):
 7517                                    criterion_mode = "operation"
 7518                                elif np.any(
 7519                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7520                                ):
 7521                                    criterion_mode = "sql"
 7522                                log.debug(f"Criterion Mode: {criterion_mode}")
 7523
 7524                                # Criterion parameters
 7525                                criterion_type = criterion.get("type", None)
 7526                                criterion_value = criterion.get("value", None)
 7527                                criterion_sql = criterion.get("sql", None)
 7528                                criterion_fields = criterion.get("fields", None)
 7529                                criterion_score = criterion.get("score", 0)
 7530                                criterion_flag = criterion.get("flag", "PASS")
 7531                                criterion_class = criterion.get("class", None)
 7532                                criterion_flag_bool = criterion_flag == "PASS"
 7533                                criterion_comment = (
 7534                                    ", ".join(criterion.get("comment", []))
 7535                                    .replace("'", "''")
 7536                                    .replace(";", ",")
 7537                                    .replace("\t", " ")
 7538                                )
 7539                                criterion_infos = (
 7540                                    str(criterion)
 7541                                    .replace("'", "''")
 7542                                    .replace(";", ",")
 7543                                    .replace("\t", " ")
 7544                                )
 7545
 7546                                # SQL
 7547                                if criterion_sql is not None and isinstance(
 7548                                    criterion_sql, list
 7549                                ):
 7550                                    criterion_sql = " ".join(criterion_sql)
 7551
 7552                                # Fields and explode
 7553                                if criterion_fields is None:
 7554                                    criterion_fields = [annotation]
 7555                                if not isinstance(criterion_fields, list):
 7556                                    criterion_fields = str(criterion_fields).split(",")
 7557
 7558                                # Class
 7559                                if criterion_class is not None and not isinstance(
 7560                                    criterion_class, list
 7561                                ):
 7562                                    criterion_class = str(criterion_class).split(",")
 7563
 7564                                for annotation_field in criterion_fields:
 7565
 7566                                    # Explode specific annotation
 7567                                    log.debug(
 7568                                        f"Explode annotation '{annotation_field}'"
 7569                                    )
 7570                                    added_columns += self.explode_infos(
 7571                                        prefix=explode_infos_prefix,
 7572                                        fields=[annotation_field],
 7573                                        table=table_variants,
 7574                                    )
 7575                                    extra_infos = self.get_extra_infos(
 7576                                        table=table_variants
 7577                                    )
 7578
 7579                                    # Check if annotation field is present
 7580                                    if (
 7581                                        f"{explode_infos_prefix}{annotation_field}"
 7582                                        not in extra_infos
 7583                                    ):
 7584                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7585                                        log.error(msq_err)
 7586                                        raise ValueError(msq_err)
 7587                                    else:
 7588                                        log.debug(
 7589                                            f"Annotation '{annotation_field}' in data"
 7590                                        )
 7591
 7592                                sql_set = []
 7593                                sql_set_info = []
 7594
 7595                                # PZ fields set
 7596
 7597                                # PZScore
 7598                                if (
 7599                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7600                                    in list_of_pzfields
 7601                                ):
 7602                                    # VaRank prioritization score mode
 7603                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
 7604                                        sql_set.append(
 7605                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7606                                        )
 7607                                    # default HOWARD prioritization score mode
 7608                                    else:
 7609                                        sql_set.append(
 7610                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7611                                        )
 7612
 7613                                # PZFlag
 7614                                if (
 7615                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7616                                    in list_of_pzfields
 7617                                ):
 7618                                    sql_set.append(
 7619                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7620                                    )
 7621
 7622                                # PZClass
 7623                                if (
 7624                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7625                                    in list_of_pzfields
 7626                                    and criterion_class is not None
 7627                                ):
 7628                                    sql_set.append(
 7629                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7630                                    )
 7631
 7632                                # PZComment
 7633                                if (
 7634                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7635                                    in list_of_pzfields
 7636                                ):
 7637                                    sql_set.append(
 7638                                        f"""
 7639                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7640                                                concat(
 7641                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7642                                                    CASE 
 7643                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7644                                                        THEN ', '
 7645                                                        ELSE ''
 7646                                                    END,
 7647                                                    '{criterion_comment}'
 7648                                                )
 7649                                        """
 7650                                    )
 7651
 7652                                # PZInfos
 7653                                if (
 7654                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7655                                    in list_of_pzfields
 7656                                ):
 7657                                    sql_set.append(
 7658                                        f"""
 7659                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7660                                                concat(
 7661                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7662                                                    '{criterion_infos}'
 7663                                                )
 7664                                        """
 7665                                    )
 7666                                sql_set_option = ",".join(sql_set)
 7667
 7668                                # Criterion and comparison
 7669                                if sql_set_option:
 7670
 7671                                    if criterion_mode in ["operation"]:
 7672
 7673                                        try:
 7674                                            float(criterion_value)
 7675                                            sql_update = f"""
 7676                                                UPDATE {table_variants}
 7677                                                SET {sql_set_option}
 7678                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7679                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7680                                            """
 7681                                        except:
 7682                                            contains_option = ""
 7683                                            if criterion_type == "contains":
 7684                                                contains_option = ".*"
 7685                                            sql_update = f"""
 7686                                                UPDATE {table_variants}
 7687                                                SET {sql_set_option}
 7688                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7689                                            """
 7690                                        sql_queries.append(sql_update)
 7691
 7692                                    elif criterion_mode in ["sql"]:
 7693
 7694                                        sql_update = f"""
 7695                                            UPDATE {table_variants}
 7696                                            SET {sql_set_option}
 7697                                            WHERE {criterion_sql}
 7698                                        """
 7699                                        sql_queries.append(sql_update)
 7700
 7701                                    else:
 7702                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7703                                        log.error(msg_err)
 7704                                        raise ValueError(msg_err)
 7705
 7706                                else:
 7707                                    log.warning(
 7708                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7709                                    )
 7710
 7711                        # PZTags
 7712                        if (
 7713                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7714                            in list_of_pzfields
 7715                        ):
 7716
 7717                            # Create PZFalgs value
 7718                            pztags_value = ""
 7719                            pztags_sep_default = ","
 7720                            pztags_sep = ""
 7721                            for pzfield in pzfields:
 7722                                if pzfield not in [f"{pz_prefix}Tags"]:
 7723                                    if (
 7724                                        f"{pzfield}{pzfields_sep}{profile}"
 7725                                        in list_of_pzfields
 7726                                    ):
 7727                                        if pzfield in [f"{pz_prefix}Flag"]:
 7728                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7729                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7730                                                    THEN 'PASS'
 7731                                                    ELSE 'FILTERED'
 7732                                                END, '"""
 7733                                        elif pzfield in [f"{pz_prefix}Class"]:
 7734                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7735                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7736                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7737                                                    ELSE '.'
 7738                                                END, '"""
 7739                                        else:
 7740                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7741                                        pztags_sep = pztags_sep_default
 7742
 7743                            # Add Query update for PZFlags
 7744                            sql_update_pztags = f"""
 7745                                UPDATE {table_variants}
 7746                                SET INFO = concat(
 7747                                        INFO,
 7748                                        CASE WHEN INFO NOT in ('','.')
 7749                                                THEN ';'
 7750                                                ELSE ''
 7751                                        END,
 7752                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7753                                    )
 7754                                """
 7755                            sql_queries.append(sql_update_pztags)
 7756
 7757                            # Add Query update for PZFlags for default
 7758                            if profile == default_profile:
 7759                                sql_update_pztags_default = f"""
 7760                                UPDATE {table_variants}
 7761                                SET INFO = concat(
 7762                                        INFO,
 7763                                        ';',
 7764                                        '{pz_prefix}Tags={pztags_value}'
 7765                                    )
 7766                                """
 7767                                sql_queries.append(sql_update_pztags_default)
 7768
 7769                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7770
 7771                        if sql_queries:
 7772
 7773                            for sql_query in sql_queries:
 7774                                log.debug(
 7775                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7776                                )
 7777                                self.conn.execute(sql_query)
 7778
 7779                        log.info(f"""Profile '{profile}' - Update... """)
 7780                        sql_query_update = f"""
 7781                            UPDATE {table_variants}
 7782                            SET INFO =  
 7783                                concat(
 7784                                    CASE
 7785                                        WHEN INFO NOT IN ('','.')
 7786                                        THEN concat(INFO, ';')
 7787                                        ELSE ''
 7788                                    END
 7789                                    {sql_set_info_option}
 7790                                )
 7791                        """
 7792                        self.conn.execute(sql_query_update)
 7793
 7794        else:
 7795
 7796            log.warning(f"No profiles in parameters")
 7797
 7798        # Remove added columns
 7799        for added_column in added_columns:
 7800            self.drop_column(column=added_column)
 7801
 7802        # Explode INFOS fields into table fields
 7803        if self.get_explode_infos():
 7804            self.explode_infos(
 7805                prefix=self.get_explode_infos_prefix(),
 7806                fields=self.get_explode_infos_fields(),
 7807                force=True,
 7808            )
 7809
 7810        return True
 7811
 7812    ###
 7813    # HGVS
 7814    ###
 7815
 7816    def annotation_hgvs(self, threads: int = None) -> None:
 7817        """
 7818        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7819        coordinates and alleles.
 7820
 7821        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7822        threads to use for parallel processing. If no value is provided, it will default to the number
 7823        of threads obtained from the `get_threads()` method
 7824        :type threads: int
 7825        """
 7826
 7827        # Function for each partition of the Dask Dataframe
 7828        def partition_function(partition):
 7829            """
 7830            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7831            each row of a DataFrame called `partition`.
 7832
 7833            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7834            to be processed
 7835            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7836            the "partition" dataframe along the axis 1.
 7837            """
 7838            return partition.apply(annotation_hgvs_partition, axis=1)
 7839
 7840        def annotation_hgvs_partition(row) -> str:
 7841            """
 7842            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7843            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7844
 7845            :param row: A dictionary-like object that contains the values for the following keys:
 7846            :return: a string that contains the HGVS names associated with the given row of data.
 7847            """
 7848
 7849            chr = row["CHROM"]
 7850            pos = row["POS"]
 7851            ref = row["REF"]
 7852            alt = row["ALT"]
 7853
 7854            # Find list of associated transcripts
 7855            transcripts_list = list(
 7856                polars_conn.execute(
 7857                    f"""
 7858                SELECT transcript
 7859                FROM refseq_df
 7860                WHERE CHROM='{chr}'
 7861                AND POS={pos}
 7862            """
 7863                )["transcript"]
 7864            )
 7865
 7866            # Full HGVS annotation in list
 7867            hgvs_full_list = []
 7868
 7869            for transcript_name in transcripts_list:
 7870
 7871                # Transcript
 7872                transcript = get_transcript(
 7873                    transcripts=transcripts, transcript_name=transcript_name
 7874                )
 7875                # Exon
 7876                if use_exon:
 7877                    exon = transcript.find_exon_number(pos)
 7878                else:
 7879                    exon = None
 7880                # Protein
 7881                transcript_protein = None
 7882                if use_protein or add_protein or full_format:
 7883                    transcripts_protein = list(
 7884                        polars_conn.execute(
 7885                            f"""
 7886                        SELECT protein
 7887                        FROM refseqlink_df
 7888                        WHERE transcript='{transcript_name}'
 7889                        LIMIT 1
 7890                    """
 7891                        )["protein"]
 7892                    )
 7893                    if len(transcripts_protein):
 7894                        transcript_protein = transcripts_protein[0]
 7895
 7896                # HGVS name
 7897                hgvs_name = format_hgvs_name(
 7898                    chr,
 7899                    pos,
 7900                    ref,
 7901                    alt,
 7902                    genome=genome,
 7903                    transcript=transcript,
 7904                    transcript_protein=transcript_protein,
 7905                    exon=exon,
 7906                    use_gene=use_gene,
 7907                    use_protein=use_protein,
 7908                    full_format=full_format,
 7909                    use_version=use_version,
 7910                    codon_type=codon_type,
 7911                )
 7912                hgvs_full_list.append(hgvs_name)
 7913                if add_protein and not use_protein and not full_format:
 7914                    hgvs_name = format_hgvs_name(
 7915                        chr,
 7916                        pos,
 7917                        ref,
 7918                        alt,
 7919                        genome=genome,
 7920                        transcript=transcript,
 7921                        transcript_protein=transcript_protein,
 7922                        exon=exon,
 7923                        use_gene=use_gene,
 7924                        use_protein=True,
 7925                        full_format=False,
 7926                        use_version=use_version,
 7927                        codon_type=codon_type,
 7928                    )
 7929                    hgvs_full_list.append(hgvs_name)
 7930
 7931            # Create liste of HGVS annotations
 7932            hgvs_full = ",".join(hgvs_full_list)
 7933
 7934            return hgvs_full
 7935
 7936        # Polars connexion
 7937        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7938
 7939        # Config
 7940        config = self.get_config()
 7941
 7942        # Databases
 7943        # Genome
 7944        databases_genomes_folders = (
 7945            config.get("folders", {})
 7946            .get("databases", {})
 7947            .get("genomes", DEFAULT_GENOME_FOLDER)
 7948        )
 7949        databases_genome = (
 7950            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7951        )
 7952        # refseq database folder
 7953        databases_refseq_folders = (
 7954            config.get("folders", {})
 7955            .get("databases", {})
 7956            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7957        )
 7958        # refseq
 7959        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7960        # refSeqLink
 7961        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7962
 7963        # Param
 7964        param = self.get_param()
 7965
 7966        # Quick HGVS
 7967        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7968            log.info(f"Quick HGVS Annotation:")
 7969            if not param.get("hgvs", None):
 7970                param["hgvs"] = {}
 7971            for option in param.get("hgvs_options", "").split(","):
 7972                option_var_val = option.split("=")
 7973                option_var = option_var_val[0]
 7974                if len(option_var_val) > 1:
 7975                    option_val = option_var_val[1]
 7976                else:
 7977                    option_val = "True"
 7978                if option_val.upper() in ["TRUE"]:
 7979                    option_val = True
 7980                elif option_val.upper() in ["FALSE"]:
 7981                    option_val = False
 7982                log.info(f"   {option_var}={option_val}")
 7983                param["hgvs"][option_var] = option_val
 7984
 7985        # Check if HGVS annotation enabled
 7986        if "hgvs" in param:
 7987            log.info(f"HGVS Annotation... ")
 7988            for hgvs_option in param.get("hgvs", {}):
 7989                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7990        else:
 7991            return
 7992
 7993        # HGVS Param
 7994        param_hgvs = param.get("hgvs", {})
 7995        use_exon = param_hgvs.get("use_exon", False)
 7996        use_gene = param_hgvs.get("use_gene", False)
 7997        use_protein = param_hgvs.get("use_protein", False)
 7998        add_protein = param_hgvs.get("add_protein", False)
 7999        full_format = param_hgvs.get("full_format", False)
 8000        use_version = param_hgvs.get("use_version", False)
 8001        codon_type = param_hgvs.get("codon_type", "3")
 8002
 8003        # refSseq refSeqLink
 8004        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8005        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8006
 8007        # Assembly
 8008        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8009
 8010        # Genome
 8011        genome_file = None
 8012        if find_genome(databases_genome):
 8013            genome_file = find_genome(databases_genome)
 8014        else:
 8015            genome_file = find_genome(
 8016                genome_path=databases_genomes_folders, assembly=assembly
 8017            )
 8018        log.debug("Genome: " + str(genome_file))
 8019
 8020        # refSseq
 8021        refseq_file = find_file_prefix(
 8022            input_file=databases_refseq,
 8023            prefix="ncbiRefSeq",
 8024            folder=databases_refseq_folders,
 8025            assembly=assembly,
 8026        )
 8027        log.debug("refSeq: " + str(refseq_file))
 8028
 8029        # refSeqLink
 8030        refseqlink_file = find_file_prefix(
 8031            input_file=databases_refseqlink,
 8032            prefix="ncbiRefSeqLink",
 8033            folder=databases_refseq_folders,
 8034            assembly=assembly,
 8035        )
 8036        log.debug("refSeqLink: " + str(refseqlink_file))
 8037
 8038        # Threads
 8039        if not threads:
 8040            threads = self.get_threads()
 8041        log.debug("Threads: " + str(threads))
 8042
 8043        # Variables
 8044        table_variants = self.get_table_variants(clause="update")
 8045
 8046        # Get variants SNV and InDel only
 8047        query_variants = f"""
 8048            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8049            FROM {table_variants}
 8050            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8051            """
 8052        df_variants = self.get_query_to_df(query_variants)
 8053
 8054        # Added columns
 8055        added_columns = []
 8056
 8057        # Add hgvs column in variants table
 8058        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8059        added_column = self.add_column(
 8060            table_variants, hgvs_column_name, "STRING", default_value=None
 8061        )
 8062        added_columns.append(added_column)
 8063
 8064        log.debug(f"refSeq loading...")
 8065        # refSeq in duckDB
 8066        refseq_table = get_refseq_table(
 8067            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8068        )
 8069        # Loading all refSeq in Dataframe
 8070        refseq_query = f"""
 8071            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8072            FROM {refseq_table}
 8073            JOIN df_variants ON (
 8074                {refseq_table}.chrom = df_variants.CHROM
 8075                AND {refseq_table}.txStart<=df_variants.POS
 8076                AND {refseq_table}.txEnd>=df_variants.POS
 8077            )
 8078        """
 8079        refseq_df = self.conn.query(refseq_query).pl()
 8080
 8081        if refseqlink_file:
 8082            log.debug(f"refSeqLink loading...")
 8083            # refSeqLink in duckDB
 8084            refseqlink_table = get_refseq_table(
 8085                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8086            )
 8087            # Loading all refSeqLink in Dataframe
 8088            protacc_column = "protAcc_with_ver"
 8089            mrnaacc_column = "mrnaAcc_with_ver"
 8090            refseqlink_query = f"""
 8091                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8092                FROM {refseqlink_table} 
 8093                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8094                WHERE protAcc_without_ver IS NOT NULL
 8095            """
 8096            # Polars Dataframe
 8097            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8098
 8099        # Read RefSeq transcripts into a python dict/model.
 8100        log.debug(f"Transcripts loading...")
 8101        with tempfile.TemporaryDirectory() as tmpdir:
 8102            transcripts_query = f"""
 8103                COPY (
 8104                    SELECT {refseq_table}.*
 8105                    FROM {refseq_table}
 8106                    JOIN df_variants ON (
 8107                        {refseq_table}.chrom=df_variants.CHROM
 8108                        AND {refseq_table}.txStart<=df_variants.POS
 8109                        AND {refseq_table}.txEnd>=df_variants.POS
 8110                    )
 8111                )
 8112                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8113            """
 8114            self.conn.query(transcripts_query)
 8115            with open(f"{tmpdir}/transcript.tsv") as infile:
 8116                transcripts = read_transcripts(infile)
 8117
 8118        # Polars connexion
 8119        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8120
 8121        log.debug("Genome loading...")
 8122        # Read genome sequence using pyfaidx.
 8123        genome = Fasta(genome_file)
 8124
 8125        log.debug("Start annotation HGVS...")
 8126
 8127        # Create
 8128        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8129        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8130
 8131        # Use dask.dataframe.apply() to apply function on each partition
 8132        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8133
 8134        # Convert Dask DataFrame to Pandas Dataframe
 8135        df = ddf.compute()
 8136
 8137        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8138        with tempfile.TemporaryDirectory() as tmpdir:
 8139            df_parquet = os.path.join(tmpdir, "df.parquet")
 8140            df.to_parquet(df_parquet)
 8141
 8142            # Update hgvs column
 8143            update_variant_query = f"""
 8144                UPDATE {table_variants}
 8145                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8146                FROM read_parquet('{df_parquet}') as df
 8147                WHERE variants."#CHROM" = df.CHROM
 8148                AND variants.POS = df.POS
 8149                AND variants.REF = df.REF
 8150                AND variants.ALT = df.ALT
 8151                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8152                """
 8153            self.execute_query(update_variant_query)
 8154
 8155        # Update INFO column
 8156        sql_query_update = f"""
 8157            UPDATE {table_variants}
 8158            SET INFO = 
 8159                concat(
 8160                    CASE 
 8161                        WHEN INFO NOT IN ('','.')
 8162                        THEN concat(INFO, ';')
 8163                        ELSE ''
 8164                    END,
 8165                    'hgvs=',
 8166                    {hgvs_column_name}
 8167                )
 8168            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8169            """
 8170        self.execute_query(sql_query_update)
 8171
 8172        # Add header
 8173        HGVS_INFOS = {
 8174            "hgvs": {
 8175                "ID": "hgvs",
 8176                "Number": ".",
 8177                "Type": "String",
 8178                "Description": f"HGVS annotatation with HOWARD",
 8179            }
 8180        }
 8181
 8182        for field in HGVS_INFOS:
 8183            field_ID = HGVS_INFOS[field]["ID"]
 8184            field_description = HGVS_INFOS[field]["Description"]
 8185            self.get_header().infos[field_ID] = vcf.parser._Info(
 8186                field_ID,
 8187                HGVS_INFOS[field]["Number"],
 8188                HGVS_INFOS[field]["Type"],
 8189                field_description,
 8190                "unknown",
 8191                "unknown",
 8192                code_type_map[HGVS_INFOS[field]["Type"]],
 8193            )
 8194
 8195        # Remove added columns
 8196        for added_column in added_columns:
 8197            self.drop_column(column=added_column)
 8198
 8199    ###
 8200    # Calculation
 8201    ###
 8202
 8203    def get_operations_help(
 8204        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8205    ) -> list:
 8206
 8207        # Init
 8208        operations_help = []
 8209
 8210        # operations
 8211        operations = self.get_config_json(
 8212            name="calculations",
 8213            config_dict=operations_config_dict,
 8214            config_file=operations_config_file,
 8215        )
 8216        for op in operations:
 8217            op_name = operations[op].get("name", op).upper()
 8218            op_description = operations[op].get("description", op_name)
 8219            op_available = operations[op].get("available", False)
 8220            if op_available:
 8221                operations_help.append(f"   {op_name}: {op_description}")
 8222
 8223        # Sort operations
 8224        operations_help.sort()
 8225
 8226        # insert header
 8227        operations_help.insert(0, "Available calculation operations:")
 8228
 8229        # Return
 8230        return operations_help
 8231
 8232    def calculation(
 8233        self,
 8234        operations: dict = {},
 8235        operations_config_dict: dict = {},
 8236        operations_config_file: str = None,
 8237    ) -> None:
 8238        """
 8239        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8240        operation, and then calls the appropriate function
 8241
 8242        param json example:
 8243            "calculation": {
 8244                "NOMEN": {
 8245                    "options": {
 8246                        "hgvs_field": "hgvs"
 8247                    },
 8248                "middle" : null
 8249            }
 8250        """
 8251
 8252        # Param
 8253        param = self.get_param()
 8254
 8255        # CHeck operations config file
 8256        if operations_config_file is None:
 8257            operations_config_file = param.get("calculation", {}).get(
 8258                "calculation_config", None
 8259            )
 8260
 8261        # operations config
 8262        operations_config = self.get_config_json(
 8263            name="calculations",
 8264            config_dict=operations_config_dict,
 8265            config_file=operations_config_file,
 8266        )
 8267
 8268        # Upper keys
 8269        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8270
 8271        # Calculations
 8272
 8273        # Operations from param
 8274        operations = param.get("calculation", {}).get("calculations", operations)
 8275
 8276        # Quick calculation - add
 8277        if param.get("calculations", None):
 8278
 8279            # List of operations
 8280            calculations_list = [
 8281                value.strip() for value in param.get("calculations", "").split(",")
 8282            ]
 8283
 8284            # Log
 8285            log.info(f"Quick Calculations:")
 8286            for calculation_key in calculations_list:
 8287                log.info(f"   {calculation_key}")
 8288
 8289            # Create tmp operations (to keep operation order)
 8290            operations_tmp = {}
 8291            for calculation_operation in calculations_list:
 8292                if calculation_operation.upper() not in operations_tmp:
 8293                    log.debug(
 8294                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8295                    )
 8296                    operations_tmp[calculation_operation.upper()] = {}
 8297                    add_value_into_dict(
 8298                        dict_tree=operations_tmp,
 8299                        sections=[
 8300                            calculation_operation.upper(),
 8301                        ],
 8302                        value=operations.get(calculation_operation.upper(), {}),
 8303                    )
 8304            # Add operations already in param
 8305            for calculation_operation in operations:
 8306                if calculation_operation not in operations_tmp:
 8307                    operations_tmp[calculation_operation] = operations.get(
 8308                        calculation_operation, {}
 8309                    )
 8310
 8311            # Update operations in param
 8312            operations = operations_tmp
 8313
 8314        # Operations for calculation
 8315        if not operations:
 8316            operations = param.get("calculation", {}).get("calculations", {})
 8317
 8318        if operations:
 8319            log.info(f"Calculations...")
 8320
 8321        # For each operations
 8322        for operation_name in operations:
 8323            operation_name = operation_name.upper()
 8324            if operation_name not in [""]:
 8325                if operation_name in operations_config:
 8326                    log.info(f"Calculation '{operation_name}'")
 8327                    operation = operations_config[operation_name]
 8328                    operation_type = operation.get("type", "sql")
 8329                    if operation_type == "python":
 8330                        self.calculation_process_function(
 8331                            operation=operation, operation_name=operation_name
 8332                        )
 8333                    elif operation_type == "sql":
 8334                        self.calculation_process_sql(
 8335                            operation=operation, operation_name=operation_name
 8336                        )
 8337                    else:
 8338                        log.error(
 8339                            f"Operations config: Type '{operation_type}' NOT available"
 8340                        )
 8341                        raise ValueError(
 8342                            f"Operations config: Type '{operation_type}' NOT available"
 8343                        )
 8344                else:
 8345                    log.error(
 8346                        f"Operations config: Calculation '{operation_name}' NOT available"
 8347                    )
 8348                    raise ValueError(
 8349                        f"Operations config: Calculation '{operation_name}' NOT available"
 8350                    )
 8351
 8352        # Explode INFOS fields into table fields
 8353        if self.get_explode_infos():
 8354            self.explode_infos(
 8355                prefix=self.get_explode_infos_prefix(),
 8356                fields=self.get_explode_infos_fields(),
 8357                force=True,
 8358            )
 8359
 8360    def calculation_process_sql(
 8361        self, operation: dict, operation_name: str = "unknown"
 8362    ) -> None:
 8363        """
 8364        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8365        performs the operation, updating the specified table with the result.
 8366
 8367        :param operation: The `operation` parameter is a dictionary that contains information about the
 8368        mathematical operation to be performed. It includes the following keys:
 8369        :type operation: dict
 8370        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8371        the mathematical operation being performed. It is used for logging and error handling purposes,
 8372        defaults to unknown
 8373        :type operation_name: str (optional)
 8374        """
 8375
 8376        # Operation infos
 8377        operation_name = operation.get("name", "unknown")
 8378        log.debug(f"process SQL {operation_name}")
 8379        output_column_name = operation.get("output_column_name", operation_name)
 8380        output_column_type = operation.get("output_column_type", "String")
 8381        prefix = operation.get("explode_infos_prefix", "")
 8382        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8383        output_column_description = operation.get(
 8384            "output_column_description", f"{operation_name} operation"
 8385        )
 8386        operation_query = operation.get("operation_query", None)
 8387        if isinstance(operation_query, list):
 8388            operation_query = " ".join(operation_query)
 8389        operation_info_fields = operation.get("info_fields", [])
 8390        operation_info_fields_check = operation.get("info_fields_check", False)
 8391        operation_info = operation.get("operation_info", True)
 8392        operation_table = operation.get(
 8393            "table", self.get_table_variants(clause="alter")
 8394        )
 8395
 8396        # table variants
 8397        if operation_table:
 8398            table_variants = operation_table
 8399        else:
 8400            table_variants = self.get_table_variants(clause="alter")
 8401
 8402        if operation_query:
 8403
 8404            # Info fields check
 8405            operation_info_fields_check_result = True
 8406            if operation_info_fields_check:
 8407                header_infos = self.get_header().infos
 8408                for info_field in operation_info_fields:
 8409                    operation_info_fields_check_result = (
 8410                        operation_info_fields_check_result
 8411                        and info_field in header_infos
 8412                    )
 8413
 8414            # If info fields available
 8415            if operation_info_fields_check_result:
 8416
 8417                # Added_columns
 8418                added_columns = []
 8419
 8420                # Create VCF header field
 8421                vcf_reader = self.get_header()
 8422                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8423                    output_column_name,
 8424                    ".",
 8425                    output_column_type,
 8426                    output_column_description,
 8427                    "howard calculation",
 8428                    "0",
 8429                    self.code_type_map.get(output_column_type),
 8430                )
 8431
 8432                # Explode infos if needed
 8433                log.debug(f"calculation_process_sql prefix {prefix}")
 8434                added_columns += self.explode_infos(
 8435                    prefix=prefix,
 8436                    fields=[output_column_name] + operation_info_fields,
 8437                    force=False,
 8438                    table=table_variants,
 8439                )
 8440
 8441                # Create column
 8442                added_column = self.add_column(
 8443                    table_name=table_variants,
 8444                    column_name=prefix + output_column_name,
 8445                    column_type=output_column_type_sql,
 8446                    default_value="null",
 8447                )
 8448                added_columns.append(added_column)
 8449
 8450                # Operation calculation
 8451                try:
 8452
 8453                    # Query to update calculation column
 8454                    sql_update = f"""
 8455                        UPDATE {table_variants}
 8456                        SET "{prefix}{output_column_name}" = ({operation_query})
 8457                    """
 8458                    self.conn.execute(sql_update)
 8459
 8460                    # Add to INFO
 8461                    if operation_info:
 8462                        sql_update_info = f"""
 8463                            UPDATE {table_variants}
 8464                            SET "INFO" =
 8465                                concat(
 8466                                    CASE
 8467                                        WHEN "INFO" IS NOT NULL
 8468                                        THEN concat("INFO", ';')
 8469                                        ELSE ''
 8470                                    END,
 8471                                    '{output_column_name}=',
 8472                                    "{prefix}{output_column_name}"
 8473                                )
 8474                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8475                        """
 8476                        self.conn.execute(sql_update_info)
 8477
 8478                except:
 8479                    log.error(
 8480                        f"Operations config: Calculation '{operation_name}' query failed"
 8481                    )
 8482                    raise ValueError(
 8483                        f"Operations config: Calculation '{operation_name}' query failed"
 8484                    )
 8485
 8486                # Remove added columns
 8487                for added_column in added_columns:
 8488                    log.debug(f"added_column: {added_column}")
 8489                    self.drop_column(column=added_column)
 8490
 8491            else:
 8492                log.error(
 8493                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8494                )
 8495                raise ValueError(
 8496                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8497                )
 8498
 8499        else:
 8500            log.error(
 8501                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8502            )
 8503            raise ValueError(
 8504                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8505            )
 8506
 8507    def calculation_process_function(
 8508        self, operation: dict, operation_name: str = "unknown"
 8509    ) -> None:
 8510        """
 8511        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8512        function with the given parameters.
 8513
 8514        :param operation: The `operation` parameter is a dictionary that contains information about the
 8515        operation to be performed. It has the following keys:
 8516        :type operation: dict
 8517        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8518        the operation being performed. It is used for logging purposes, defaults to unknown
 8519        :type operation_name: str (optional)
 8520        """
 8521
 8522        operation_name = operation["name"]
 8523        log.debug(f"process Python {operation_name}")
 8524        function_name = operation["function_name"]
 8525        function_params = operation["function_params"]
 8526        getattr(self, function_name)(*function_params)
 8527
 8528    def calculation_variant_id(self) -> None:
 8529        """
 8530        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8531        updates the INFO field of a variants table with the variant ID.
 8532        """
 8533
 8534        # variant_id annotation field
 8535        variant_id_tag = self.get_variant_id_column()
 8536        added_columns = [variant_id_tag]
 8537
 8538        # variant_id hgvs tags"
 8539        vcf_infos_tags = {
 8540            variant_id_tag: "howard variant ID annotation",
 8541        }
 8542
 8543        # Variants table
 8544        table_variants = self.get_table_variants()
 8545
 8546        # Header
 8547        vcf_reader = self.get_header()
 8548
 8549        # Add variant_id to header
 8550        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8551            variant_id_tag,
 8552            ".",
 8553            "String",
 8554            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8555            "howard calculation",
 8556            "0",
 8557            self.code_type_map.get("String"),
 8558        )
 8559
 8560        # Update
 8561        sql_update = f"""
 8562            UPDATE {table_variants}
 8563            SET "INFO" = 
 8564                concat(
 8565                    CASE
 8566                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8567                        THEN ''
 8568                        ELSE concat("INFO", ';')
 8569                    END,
 8570                    '{variant_id_tag}=',
 8571                    "{variant_id_tag}"
 8572                )
 8573        """
 8574        self.conn.execute(sql_update)
 8575
 8576        # Remove added columns
 8577        for added_column in added_columns:
 8578            self.drop_column(column=added_column)
 8579
 8580    def calculation_extract_snpeff_hgvs(
 8581        self,
 8582        snpeff_hgvs: str = "snpeff_hgvs",
 8583        snpeff_field: str = "ANN",
 8584    ) -> None:
 8585        """
 8586        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8587        annotation field in a VCF file and adds them as a new column in the variants table.
 8588
 8589        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8590        function is used to specify the name of the column that will store the HGVS nomenclatures
 8591        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8592        snpeff_hgvs
 8593        :type snpeff_hgvs: str (optional)
 8594        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8595        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8596        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8597        to ANN
 8598        :type snpeff_field: str (optional)
 8599        """
 8600
 8601        # Snpeff hgvs tags
 8602        vcf_infos_tags = {
 8603            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8604        }
 8605
 8606        # Prefix
 8607        prefix = self.get_explode_infos_prefix()
 8608        if prefix:
 8609            prefix = "INFO/"
 8610
 8611        # snpEff fields
 8612        speff_ann_infos = prefix + snpeff_field
 8613        speff_hgvs_infos = prefix + snpeff_hgvs
 8614
 8615        # Variants table
 8616        table_variants = self.get_table_variants()
 8617
 8618        # Header
 8619        vcf_reader = self.get_header()
 8620
 8621        # Add columns
 8622        added_columns = []
 8623
 8624        # Explode HGVS field in column
 8625        added_columns += self.explode_infos(fields=[snpeff_field])
 8626
 8627        if snpeff_field in vcf_reader.infos:
 8628
 8629            log.debug(vcf_reader.infos[snpeff_field])
 8630
 8631            # Extract ANN header
 8632            ann_description = vcf_reader.infos[snpeff_field].desc
 8633            pattern = r"'(.+?)'"
 8634            match = re.search(pattern, ann_description)
 8635            if match:
 8636                ann_header_match = match.group(1).split(" | ")
 8637                ann_header_desc = {}
 8638                for i in range(len(ann_header_match)):
 8639                    ann_header_info = "".join(
 8640                        char for char in ann_header_match[i] if char.isalnum()
 8641                    )
 8642                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8643                if not ann_header_desc:
 8644                    raise ValueError("Invalid header description format")
 8645            else:
 8646                raise ValueError("Invalid header description format")
 8647
 8648            # Create variant id
 8649            variant_id_column = self.get_variant_id_column()
 8650            added_columns += [variant_id_column]
 8651
 8652            # Create dataframe
 8653            dataframe_snpeff_hgvs = self.get_query_to_df(
 8654                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8655            )
 8656
 8657            # Create main NOMEN column
 8658            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8659                speff_ann_infos
 8660            ].apply(
 8661                lambda x: extract_snpeff_hgvs(
 8662                    str(x), header=list(ann_header_desc.values())
 8663                )
 8664            )
 8665
 8666            # Add snpeff_hgvs to header
 8667            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8668                snpeff_hgvs,
 8669                ".",
 8670                "String",
 8671                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8672                "howard calculation",
 8673                "0",
 8674                self.code_type_map.get("String"),
 8675            )
 8676
 8677            # Update
 8678            sql_update = f"""
 8679                UPDATE variants
 8680                SET "INFO" = 
 8681                    concat(
 8682                        CASE
 8683                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8684                            THEN ''
 8685                            ELSE concat("INFO", ';')
 8686                        END,
 8687                        CASE 
 8688                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8689                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8690                            THEN concat(
 8691                                    '{snpeff_hgvs}=',
 8692                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8693                                )
 8694                            ELSE ''
 8695                        END
 8696                    )
 8697                FROM dataframe_snpeff_hgvs
 8698                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8699
 8700            """
 8701            self.conn.execute(sql_update)
 8702
 8703            # Delete dataframe
 8704            del dataframe_snpeff_hgvs
 8705            gc.collect()
 8706
 8707        else:
 8708
 8709            log.warning(
 8710                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8711            )
 8712
 8713        # Remove added columns
 8714        for added_column in added_columns:
 8715            self.drop_column(column=added_column)
 8716
 8717    def calculation_snpeff_ann_explode(
 8718        self,
 8719        uniquify: bool = True,
 8720        output_format: str = "fields",
 8721        output_prefix: str = "snpeff_",
 8722        snpeff_field: str = "ANN",
 8723    ) -> None:
 8724        """
 8725        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8726        exploding the HGVS field and updating variant information accordingly.
 8727
 8728        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8729        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8730        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8731        defaults to True
 8732        :type uniquify: bool (optional)
 8733        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8734        function specifies the format in which the output annotations will be generated. It has a
 8735        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8736        format, defaults to fields
 8737        :type output_format: str (optional)
 8738        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8739        method is used to specify the prefix that will be added to the output annotations generated
 8740        during the calculation process. This prefix helps to differentiate the newly added annotations
 8741        from existing ones in the output data. By default, the, defaults to ANN_
 8742        :type output_prefix: str (optional)
 8743        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8744        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8745        field will be processed to explode the HGVS annotations and update the variant information
 8746        accordingly, defaults to ANN
 8747        :type snpeff_field: str (optional)
 8748        """
 8749
 8750        # SnpEff annotation field
 8751        snpeff_hgvs = "snpeff_ann_explode"
 8752
 8753        # Snpeff hgvs tags
 8754        vcf_infos_tags = {
 8755            snpeff_hgvs: "Explode snpEff annotations",
 8756        }
 8757
 8758        # Prefix
 8759        prefix = self.get_explode_infos_prefix()
 8760        if prefix:
 8761            prefix = "INFO/"
 8762
 8763        # snpEff fields
 8764        speff_ann_infos = prefix + snpeff_field
 8765        speff_hgvs_infos = prefix + snpeff_hgvs
 8766
 8767        # Variants table
 8768        table_variants = self.get_table_variants()
 8769
 8770        # Header
 8771        vcf_reader = self.get_header()
 8772
 8773        # Add columns
 8774        added_columns = []
 8775
 8776        # Explode HGVS field in column
 8777        added_columns += self.explode_infos(fields=[snpeff_field])
 8778        log.debug(f"snpeff_field={snpeff_field}")
 8779        log.debug(f"added_columns={added_columns}")
 8780
 8781        if snpeff_field in vcf_reader.infos:
 8782
 8783            # Extract ANN header
 8784            ann_description = vcf_reader.infos[snpeff_field].desc
 8785            pattern = r"'(.+?)'"
 8786            match = re.search(pattern, ann_description)
 8787            if match:
 8788                ann_header_match = match.group(1).split(" | ")
 8789                ann_header = []
 8790                ann_header_desc = {}
 8791                for i in range(len(ann_header_match)):
 8792                    ann_header_info = "".join(
 8793                        char for char in ann_header_match[i] if char.isalnum()
 8794                    )
 8795                    ann_header.append(ann_header_info)
 8796                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8797                if not ann_header_desc:
 8798                    raise ValueError("Invalid header description format")
 8799            else:
 8800                raise ValueError("Invalid header description format")
 8801
 8802            # Create variant id
 8803            variant_id_column = self.get_variant_id_column()
 8804            added_columns += [variant_id_column]
 8805
 8806            # Create dataframe
 8807            dataframe_snpeff_hgvs = self.get_query_to_df(
 8808                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8809            )
 8810
 8811            # Create snpEff columns
 8812            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8813                speff_ann_infos
 8814            ].apply(
 8815                lambda x: explode_snpeff_ann(
 8816                    str(x),
 8817                    uniquify=uniquify,
 8818                    output_format=output_format,
 8819                    prefix=output_prefix,
 8820                    header=list(ann_header_desc.values()),
 8821                )
 8822            )
 8823
 8824            # Header
 8825            ann_annotations_prefix = ""
 8826            if output_format.upper() in ["JSON"]:
 8827                ann_annotations_prefix = f"{output_prefix}="
 8828                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8829                    output_prefix,
 8830                    ".",
 8831                    "String",
 8832                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8833                    + " - JSON format",
 8834                    "howard calculation",
 8835                    "0",
 8836                    self.code_type_map.get("String"),
 8837                )
 8838            else:
 8839                for ann_annotation in ann_header:
 8840                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8841                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8842                        ann_annotation_id,
 8843                        ".",
 8844                        "String",
 8845                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8846                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8847                        "howard calculation",
 8848                        "0",
 8849                        self.code_type_map.get("String"),
 8850                    )
 8851
 8852            # Update
 8853            sql_update = f"""
 8854                UPDATE variants
 8855                SET "INFO" = 
 8856                    concat(
 8857                        CASE
 8858                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8859                            THEN ''
 8860                            ELSE concat("INFO", ';')
 8861                        END,
 8862                        CASE 
 8863                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8864                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8865                            THEN concat(
 8866                                '{ann_annotations_prefix}',
 8867                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8868                                )
 8869                            ELSE ''
 8870                        END
 8871                    )
 8872                FROM dataframe_snpeff_hgvs
 8873                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8874
 8875            """
 8876            self.conn.execute(sql_update)
 8877
 8878            # Delete dataframe
 8879            del dataframe_snpeff_hgvs
 8880            gc.collect()
 8881
 8882        else:
 8883
 8884            log.warning(
 8885                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8886            )
 8887
 8888        # Remove added columns
 8889        for added_column in added_columns:
 8890            self.drop_column(column=added_column)
 8891
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The HGVS annotation field (param 'calculation.calculations.NOMEN.options.hgvs_field',
        default 'hgvs') is exploded into a column, each value is parsed with `find_nomen`
        (honoring an optional transcripts-of-preference file and/or transcript column),
        and the resulting NOMEN sub-fields are appended to the INFO column of the
        variants table. Helper columns added along the way are dropped at the end.
        """

        # Name of the temporary dataframe column holding the find_nomen() result
        # (a struct whose fields are accessed in SQL below)
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO sub-field name -> description written to the VCF header
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        # NOTE(review): 'threads' is not used anywhere in this method
        threads = self.get_threads()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added by explode_infos()/variant id, removed at the end of the method
        added_columns = []

        # Get HGVS field name to parse (default 'hgvs')
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern (passed as-is to find_nomen)
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources (keyed by source name, e.g. "file")
        transcripts_sources = {}

        # Get transcripts-of-preference file (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file is the ordered list of preferred transcripts
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table itself)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column (optional; no per-variant transcript when absent)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant preferred transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # SQL literal NULL: no per-variant transcript available
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order (priority between column and file)
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file (empty list when no file was provided)
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: exploded columns actually available on the variants table
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with the HGVS string and optional transcript per variant
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: preferred transcript -> 1-based priority
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column (dict of NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update: emit ';FIELD=value' only when
                # the struct field is present and non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO in place, joining on the variant key columns.
            # NOTE(review): unlike sibling calculation methods, an INFO equal to
            # '.' or '' is kept verbatim here (not stripped before appending the
            # ';FIELD=...' parts) — confirm whether '.;NOMEN=...' output is intended.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 9095
 9096    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9097        """
 9098        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9099        pipeline/sample for a variant and updates the variant information in a VCF file.
 9100
 9101        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9102        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9103        VCF header and to update the corresponding field in the variants table, defaults to
 9104        findbypipeline
 9105        :type tag: str (optional)
 9106        """
 9107
 9108        # if FORMAT and samples
 9109        if (
 9110            "FORMAT" in self.get_header_columns_as_list()
 9111            and self.get_header_sample_list()
 9112        ):
 9113
 9114            # findbypipeline annotation field
 9115            findbypipeline_tag = tag
 9116
 9117            # VCF infos tags
 9118            vcf_infos_tags = {
 9119                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9120            }
 9121
 9122            # Prefix
 9123            prefix = self.get_explode_infos_prefix()
 9124
 9125            # Field
 9126            findbypipeline_infos = prefix + findbypipeline_tag
 9127
 9128            # Variants table
 9129            table_variants = self.get_table_variants()
 9130
 9131            # Header
 9132            vcf_reader = self.get_header()
 9133
 9134            # Create variant id
 9135            variant_id_column = self.get_variant_id_column()
 9136            added_columns = [variant_id_column]
 9137
 9138            # variant_id, FORMAT and samples
 9139            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9140                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9141            )
 9142
 9143            # Create dataframe
 9144            dataframe_findbypipeline = self.get_query_to_df(
 9145                f""" SELECT {samples_fields} FROM {table_variants} """
 9146            )
 9147
 9148            # Create findbypipeline column
 9149            dataframe_findbypipeline[findbypipeline_infos] = (
 9150                dataframe_findbypipeline.apply(
 9151                    lambda row: findbypipeline(
 9152                        row, samples=self.get_header_sample_list()
 9153                    ),
 9154                    axis=1,
 9155                )
 9156            )
 9157
 9158            # Add snpeff_hgvs to header
 9159            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9160                findbypipeline_tag,
 9161                ".",
 9162                "String",
 9163                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9164                "howard calculation",
 9165                "0",
 9166                self.code_type_map.get("String"),
 9167            )
 9168
 9169            # Update
 9170            sql_update = f"""
 9171                UPDATE variants
 9172                SET "INFO" = 
 9173                    concat(
 9174                        CASE
 9175                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9176                            THEN ''
 9177                            ELSE concat("INFO", ';')
 9178                        END,
 9179                        CASE 
 9180                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9181                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9182                            THEN concat(
 9183                                    '{findbypipeline_tag}=',
 9184                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9185                                )
 9186                            ELSE ''
 9187                        END
 9188                    )
 9189                FROM dataframe_findbypipeline
 9190                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9191            """
 9192            self.conn.execute(sql_update)
 9193
 9194            # Remove added columns
 9195            for added_column in added_columns:
 9196                self.drop_column(column=added_column)
 9197
 9198            # Delete dataframe
 9199            del dataframe_findbypipeline
 9200            gc.collect()
 9201
 9202    def calculation_genotype_concordance(self) -> None:
 9203        """
 9204        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9205        multi-caller VCF files and updates the variant information in the database.
 9206        """
 9207
 9208        # if FORMAT and samples
 9209        if (
 9210            "FORMAT" in self.get_header_columns_as_list()
 9211            and self.get_header_sample_list()
 9212        ):
 9213
 9214            # genotypeconcordance annotation field
 9215            genotypeconcordance_tag = "genotypeconcordance"
 9216
 9217            # VCF infos tags
 9218            vcf_infos_tags = {
 9219                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9220            }
 9221
 9222            # Prefix
 9223            prefix = self.get_explode_infos_prefix()
 9224
 9225            # Field
 9226            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9227
 9228            # Variants table
 9229            table_variants = self.get_table_variants()
 9230
 9231            # Header
 9232            vcf_reader = self.get_header()
 9233
 9234            # Create variant id
 9235            variant_id_column = self.get_variant_id_column()
 9236            added_columns = [variant_id_column]
 9237
 9238            # variant_id, FORMAT and samples
 9239            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9240                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9241            )
 9242
 9243            # Create dataframe
 9244            dataframe_genotypeconcordance = self.get_query_to_df(
 9245                f""" SELECT {samples_fields} FROM {table_variants} """
 9246            )
 9247
 9248            # Create genotypeconcordance column
 9249            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9250                dataframe_genotypeconcordance.apply(
 9251                    lambda row: genotypeconcordance(
 9252                        row, samples=self.get_header_sample_list()
 9253                    ),
 9254                    axis=1,
 9255                )
 9256            )
 9257
 9258            # Add genotypeconcordance to header
 9259            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9260                genotypeconcordance_tag,
 9261                ".",
 9262                "String",
 9263                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9264                "howard calculation",
 9265                "0",
 9266                self.code_type_map.get("String"),
 9267            )
 9268
 9269            # Update
 9270            sql_update = f"""
 9271                UPDATE variants
 9272                SET "INFO" = 
 9273                    concat(
 9274                        CASE
 9275                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9276                            THEN ''
 9277                            ELSE concat("INFO", ';')
 9278                        END,
 9279                        CASE
 9280                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9281                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9282                            THEN concat(
 9283                                    '{genotypeconcordance_tag}=',
 9284                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9285                                )
 9286                            ELSE ''
 9287                        END
 9288                    )
 9289                FROM dataframe_genotypeconcordance
 9290                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9291            """
 9292            self.conn.execute(sql_update)
 9293
 9294            # Remove added columns
 9295            for added_column in added_columns:
 9296                self.drop_column(column=added_column)
 9297
 9298            # Delete dataframe
 9299            del dataframe_genotypeconcordance
 9300            gc.collect()
 9301
 9302    def calculation_barcode(self, tag: str = "barcode") -> None:
 9303        """
 9304        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9305        updates the INFO field in the file with the calculated barcode values.
 9306
 9307        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9308        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9309        the default tag name is set to "barcode", defaults to barcode
 9310        :type tag: str (optional)
 9311        """
 9312
 9313        # if FORMAT and samples
 9314        if (
 9315            "FORMAT" in self.get_header_columns_as_list()
 9316            and self.get_header_sample_list()
 9317        ):
 9318
 9319            # barcode annotation field
 9320            if not tag:
 9321                tag = "barcode"
 9322
 9323            # VCF infos tags
 9324            vcf_infos_tags = {
 9325                tag: "barcode calculation (VaRank)",
 9326            }
 9327
 9328            # Prefix
 9329            prefix = self.get_explode_infos_prefix()
 9330
 9331            # Field
 9332            barcode_infos = prefix + tag
 9333
 9334            # Variants table
 9335            table_variants = self.get_table_variants()
 9336
 9337            # Header
 9338            vcf_reader = self.get_header()
 9339
 9340            # Create variant id
 9341            variant_id_column = self.get_variant_id_column()
 9342            added_columns = [variant_id_column]
 9343
 9344            # variant_id, FORMAT and samples
 9345            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9346                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9347            )
 9348
 9349            # Create dataframe
 9350            dataframe_barcode = self.get_query_to_df(
 9351                f""" SELECT {samples_fields} FROM {table_variants} """
 9352            )
 9353
 9354            # Create barcode column
 9355            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9356                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9357            )
 9358
 9359            # Add barcode to header
 9360            vcf_reader.infos[tag] = vcf.parser._Info(
 9361                tag,
 9362                ".",
 9363                "String",
 9364                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9365                "howard calculation",
 9366                "0",
 9367                self.code_type_map.get("String"),
 9368            )
 9369
 9370            # Update
 9371            sql_update = f"""
 9372                UPDATE {table_variants}
 9373                SET "INFO" = 
 9374                    concat(
 9375                        CASE
 9376                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9377                            THEN ''
 9378                            ELSE concat("INFO", ';')
 9379                        END,
 9380                        CASE
 9381                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9382                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9383                            THEN concat(
 9384                                    '{tag}=',
 9385                                    dataframe_barcode."{barcode_infos}"
 9386                                )
 9387                            ELSE ''
 9388                        END
 9389                    )
 9390                FROM dataframe_barcode
 9391                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9392            """
 9393            self.conn.execute(sql_update)
 9394
 9395            # Remove added columns
 9396            for added_column in added_columns:
 9397                self.drop_column(column=added_column)
 9398
 9399            # Delete dataframe
 9400            del dataframe_barcode
 9401            gc.collect()
 9402
 9403    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9404        """
 9405        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9406        and updates the INFO field in the file with the calculated barcode values.
 9407
 9408        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9409        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9410        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9411        :type tag: str (optional)
 9412        """
 9413
 9414        # if FORMAT and samples
 9415        if (
 9416            "FORMAT" in self.get_header_columns_as_list()
 9417            and self.get_header_sample_list()
 9418        ):
 9419
 9420            # barcode annotation field
 9421            if not tag:
 9422                tag = "BCF"
 9423
 9424            # VCF infos tags
 9425            vcf_infos_tags = {
 9426                tag: "barcode family calculation",
 9427                f"{tag}S": "barcode family samples",
 9428            }
 9429
 9430            # Param
 9431            param = self.get_param()
 9432            log.debug(f"param={param}")
 9433
 9434            # Prefix
 9435            prefix = self.get_explode_infos_prefix()
 9436
 9437            # PED param
 9438            ped = (
 9439                param.get("calculation", {})
 9440                .get("calculations", {})
 9441                .get("BARCODEFAMILY", {})
 9442                .get("family_pedigree", None)
 9443            )
 9444            log.debug(f"ped={ped}")
 9445
 9446            # Load PED
 9447            if ped:
 9448
 9449                # Pedigree is a file
 9450                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9451                    log.debug("Pedigree is file")
 9452                    with open(full_path(ped)) as ped:
 9453                        ped = yaml.safe_load(ped)
 9454
 9455                # Pedigree is a string
 9456                elif isinstance(ped, str):
 9457                    log.debug("Pedigree is str")
 9458                    try:
 9459                        ped = json.loads(ped)
 9460                        log.debug("Pedigree is json str")
 9461                    except ValueError as e:
 9462                        ped_samples = ped.split(",")
 9463                        ped = {}
 9464                        for ped_sample in ped_samples:
 9465                            ped[ped_sample] = ped_sample
 9466
 9467                # Pedigree is a dict
 9468                elif isinstance(ped, dict):
 9469                    log.debug("Pedigree is dict")
 9470
 9471                # Pedigree is not well formatted
 9472                else:
 9473                    msg_error = "Pedigree not well formatted"
 9474                    log.error(msg_error)
 9475                    raise ValueError(msg_error)
 9476
 9477                # Construct list
 9478                ped_samples = list(ped.values())
 9479
 9480            else:
 9481                log.debug("Pedigree not defined. Take all samples")
 9482                ped_samples = self.get_header_sample_list()
 9483                ped = {}
 9484                for ped_sample in ped_samples:
 9485                    ped[ped_sample] = ped_sample
 9486
 9487            # Check pedigree
 9488            if not ped or len(ped) == 0:
 9489                msg_error = f"Error in pedigree: samples {ped_samples}"
 9490                log.error(msg_error)
 9491                raise ValueError(msg_error)
 9492
 9493            # Log
 9494            log.info(
 9495                "Calculation 'BARCODEFAMILY' - Samples: "
 9496                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9497            )
 9498            log.debug(f"ped_samples={ped_samples}")
 9499
 9500            # Field
 9501            barcode_infos = prefix + tag
 9502
 9503            # Variants table
 9504            table_variants = self.get_table_variants()
 9505
 9506            # Header
 9507            vcf_reader = self.get_header()
 9508
 9509            # Create variant id
 9510            variant_id_column = self.get_variant_id_column()
 9511            added_columns = [variant_id_column]
 9512
 9513            # variant_id, FORMAT and samples
 9514            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9515                [f""" "{sample}" """ for sample in ped_samples]
 9516            )
 9517
 9518            # Create dataframe
 9519            dataframe_barcode = self.get_query_to_df(
 9520                f""" SELECT {samples_fields} FROM {table_variants} """
 9521            )
 9522
 9523            # Create barcode column
 9524            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9525                lambda row: barcode(row, samples=ped_samples), axis=1
 9526            )
 9527
 9528            # Add barcode family to header
 9529            # Add vaf_normalization to header
 9530            vcf_reader.formats[tag] = vcf.parser._Format(
 9531                id=tag,
 9532                num=".",
 9533                type="String",
 9534                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9535                type_code=self.code_type_map.get("String"),
 9536            )
 9537            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9538                id=f"{tag}S",
 9539                num=".",
 9540                type="String",
 9541                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9542                type_code=self.code_type_map.get("String"),
 9543            )
 9544
 9545            # Update
 9546            # for sample in ped_samples:
 9547            sql_update_set = []
 9548            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9549                if sample in ped_samples:
 9550                    value = f'dataframe_barcode."{barcode_infos}"'
 9551                    value_samples = "'" + ",".join([f""" "{sample}" """ for sample in ped_samples]) + "'"
 9552                    ped_samples
 9553                elif sample == "FORMAT":
 9554                    value = f"'{tag}'"
 9555                    value_samples = f"'{tag}S'"
 9556                else:
 9557                    value = "'.'"
 9558                    value_samples = "'.'"
 9559                format_regex = r"[a-zA-Z0-9\s]"
 9560                sql_update_set.append(
 9561                    f"""
 9562                        "{sample}" = 
 9563                        concat(
 9564                            CASE
 9565                                WHEN {table_variants}."{sample}" = './.'
 9566                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9567                                ELSE {table_variants}."{sample}"
 9568                            END,
 9569                            ':',
 9570                            {value},
 9571                            ':',
 9572                            {value_samples}
 9573                        )
 9574                    """
 9575                )
 9576
 9577            sql_update_set_join = ", ".join(sql_update_set)
 9578            sql_update = f"""
 9579                UPDATE {table_variants}
 9580                SET {sql_update_set_join}
 9581                FROM dataframe_barcode
 9582                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9583            """
 9584            self.conn.execute(sql_update)
 9585
 9586            # Remove added columns
 9587            for added_column in added_columns:
 9588                self.drop_column(column=added_column)
 9589
 9590            # Delete dataframe
 9591            del dataframe_barcode
 9592            gc.collect()
 9593
 9594    def calculation_trio(self) -> None:
 9595        """
 9596        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9597        information to the INFO field of each variant.
 9598        """
 9599
 9600        # if FORMAT and samples
 9601        if (
 9602            "FORMAT" in self.get_header_columns_as_list()
 9603            and self.get_header_sample_list()
 9604        ):
 9605
 9606            # trio annotation field
 9607            trio_tag = "trio"
 9608
 9609            # VCF infos tags
 9610            vcf_infos_tags = {
 9611                "trio": "trio calculation",
 9612            }
 9613
 9614            # Param
 9615            param = self.get_param()
 9616
 9617            # Prefix
 9618            prefix = self.get_explode_infos_prefix()
 9619
 9620            # Trio param
 9621            trio_ped = (
 9622                param.get("calculation", {})
 9623                .get("calculations", {})
 9624                .get("TRIO", {})
 9625                .get("trio_pedigree", None)
 9626            )
 9627
 9628            # Load trio
 9629            if trio_ped:
 9630
 9631                # Trio pedigree is a file
 9632                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9633                    log.debug("TRIO pedigree is file")
 9634                    with open(full_path(trio_ped)) as trio_ped:
 9635                        trio_ped = yaml.safe_load(trio_ped)
 9636
 9637                # Trio pedigree is a string
 9638                elif isinstance(trio_ped, str):
 9639                    log.debug("TRIO pedigree is str")
 9640                    try:
 9641                        trio_ped = json.loads(trio_ped)
 9642                        log.debug("TRIO pedigree is json str")
 9643                    except ValueError as e:
 9644                        trio_samples = trio_ped.split(",")
 9645                        if len(trio_samples) == 3:
 9646                            trio_ped = {
 9647                                "father": trio_samples[0],
 9648                                "mother": trio_samples[1],
 9649                                "child": trio_samples[2],
 9650                            }
 9651                            log.debug("TRIO pedigree is list str")
 9652                        else:
 9653                            msg_error = "TRIO pedigree not well formatted"
 9654                            log.error(msg_error)
 9655                            raise ValueError(msg_error)
 9656
 9657                # Trio pedigree is a dict
 9658                elif isinstance(trio_ped, dict):
 9659                    log.debug("TRIO pedigree is dict")
 9660
 9661                # Trio pedigree is not well formatted
 9662                else:
 9663                    msg_error = "TRIO pedigree not well formatted"
 9664                    log.error(msg_error)
 9665                    raise ValueError(msg_error)
 9666
 9667                # Construct trio list
 9668                trio_samples = [
 9669                    trio_ped.get("father", ""),
 9670                    trio_ped.get("mother", ""),
 9671                    trio_ped.get("child", ""),
 9672                ]
 9673
 9674            else:
 9675                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9676                samples_list = self.get_header_sample_list()
 9677                if len(samples_list) >= 3:
 9678                    trio_samples = self.get_header_sample_list()[0:3]
 9679                    trio_ped = {
 9680                        "father": trio_samples[0],
 9681                        "mother": trio_samples[1],
 9682                        "child": trio_samples[2],
 9683                    }
 9684                else:
 9685                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9686                    log.error(msg_error)
 9687                    raise ValueError(msg_error)
 9688
 9689            # Check trio pedigree
 9690            if not trio_ped or len(trio_ped) != 3:
 9691                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9692                log.error(msg_error)
 9693                raise ValueError(msg_error)
 9694
 9695            # Log
 9696            log.info(
 9697                f"Calculation 'TRIO' - Samples: "
 9698                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9699            )
 9700
 9701            # Field
 9702            trio_infos = prefix + trio_tag
 9703
 9704            # Variants table
 9705            table_variants = self.get_table_variants()
 9706
 9707            # Header
 9708            vcf_reader = self.get_header()
 9709
 9710            # Create variant id
 9711            variant_id_column = self.get_variant_id_column()
 9712            added_columns = [variant_id_column]
 9713
 9714            # variant_id, FORMAT and samples
 9715            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9716                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9717            )
 9718
 9719            # Create dataframe
 9720            dataframe_trio = self.get_query_to_df(
 9721                f""" SELECT {samples_fields} FROM {table_variants} """
 9722            )
 9723
 9724            # Create trio column
 9725            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9726                lambda row: trio(row, samples=trio_samples), axis=1
 9727            )
 9728
 9729            # Add trio to header
 9730            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9731                trio_tag,
 9732                ".",
 9733                "String",
 9734                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9735                "howard calculation",
 9736                "0",
 9737                self.code_type_map.get("String"),
 9738            )
 9739
 9740            # Update
 9741            sql_update = f"""
 9742                UPDATE {table_variants}
 9743                SET "INFO" = 
 9744                    concat(
 9745                        CASE
 9746                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9747                            THEN ''
 9748                            ELSE concat("INFO", ';')
 9749                        END,
 9750                        CASE
 9751                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9752                             AND dataframe_trio."{trio_infos}" NOT NULL
 9753                            THEN concat(
 9754                                    '{trio_tag}=',
 9755                                    dataframe_trio."{trio_infos}"
 9756                                )
 9757                            ELSE ''
 9758                        END
 9759                    )
 9760                FROM dataframe_trio
 9761                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9762            """
 9763            self.conn.execute(sql_update)
 9764
 9765            # Remove added columns
 9766            for added_column in added_columns:
 9767                self.drop_column(column=added_column)
 9768
 9769            # Delete dataframe
 9770            del dataframe_trio
 9771            gc.collect()
 9772
 9773    def calculation_vaf_normalization(self) -> None:
 9774        """
 9775        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9776        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9777        :return: The function does not return anything.
 9778        """
 9779
 9780        # if FORMAT and samples
 9781        if (
 9782            "FORMAT" in self.get_header_columns_as_list()
 9783            and self.get_header_sample_list()
 9784        ):
 9785
 9786            # vaf_normalization annotation field
 9787            vaf_normalization_tag = "VAF"
 9788
 9789            # VCF infos tags
 9790            vcf_infos_tags = {
 9791                "VAF": "VAF Variant Frequency",
 9792            }
 9793
 9794            # Prefix
 9795            prefix = self.get_explode_infos_prefix()
 9796
 9797            # Variants table
 9798            table_variants = self.get_table_variants()
 9799
 9800            # Header
 9801            vcf_reader = self.get_header()
 9802
 9803            # Do not calculate if VAF already exists
 9804            if "VAF" in vcf_reader.formats:
 9805                log.debug("VAF already on genotypes")
 9806                return
 9807
 9808            # Create variant id
 9809            variant_id_column = self.get_variant_id_column()
 9810            added_columns = [variant_id_column]
 9811
 9812            # variant_id, FORMAT and samples
 9813            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9814                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9815            )
 9816
 9817            # Create dataframe
 9818            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9819            log.debug(f"query={query}")
 9820            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9821
 9822            vaf_normalization_set = []
 9823
 9824            # for each sample vaf_normalization
 9825            for sample in self.get_header_sample_list():
 9826                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9827                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9828                )
 9829                vaf_normalization_set.append(
 9830                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9831                )
 9832
 9833            # Add VAF to FORMAT
 9834            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9835                "FORMAT"
 9836            ].apply(lambda x: str(x) + ":VAF")
 9837            vaf_normalization_set.append(
 9838                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9839            )
 9840
 9841            # Add vaf_normalization to header
 9842            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9843                id=vaf_normalization_tag,
 9844                num="1",
 9845                type="Float",
 9846                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9847                type_code=self.code_type_map.get("Float"),
 9848            )
 9849
 9850            # Create fields to add in INFO
 9851            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9852
 9853            # Update
 9854            sql_update = f"""
 9855                UPDATE {table_variants}
 9856                SET {sql_vaf_normalization_set}
 9857                FROM dataframe_vaf_normalization
 9858                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9859
 9860            """
 9861            self.conn.execute(sql_update)
 9862
 9863            # Remove added columns
 9864            for added_column in added_columns:
 9865                self.drop_column(column=added_column)
 9866
 9867            # Delete dataframe
 9868            del dataframe_vaf_normalization
 9869            gc.collect()
 9870
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are produced, named '<info>_stats_nb', '_list', '_min', '_max', '_mean',
        '_mediane' and '_stdev'. Nothing is done if the VCF has no genotypes (no FORMAT column
        or no samples).

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable if the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per statistic; keys double as INFO tag names
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for the exploded column holding the computed stats
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the raw stats structure
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (join key for the UPDATE; dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe with genotypes
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column
            # NOTE(review): genotype_stats() (helper from star imports) is assumed to
            # return a dict-like keyed by the tag names in vcf_infos_tags — TODO confirm
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic, appended to INFO
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the stat tag as an INFO field in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no separator; later ones are prefixed with ';'
                # baked into the concat literal below
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO (NULL stats contribute nothing)
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update INFO, joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()
10008
10009    def calculation_transcripts_annotation(
10010        self, info_json: str = None, info_format: str = None
10011    ) -> None:
10012        """
10013        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10014        field to it if transcripts are available.
10015
10016        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10017        is a string parameter that represents the information field to be used in the transcripts JSON.
10018        It is used to specify the JSON format for the transcripts information. If no value is provided
10019        when calling the method, it defaults to "
10020        :type info_json: str
10021        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10022        method is a string parameter that specifies the format of the information field to be used in
10023        the transcripts JSON. It is used to define the format of the information field
10024        :type info_format: str
10025        """
10026
10027        # Create transcripts table
10028        transcripts_table = self.create_transcript_view()
10029
10030        # Add info field
10031        if transcripts_table:
10032            self.transcript_view_to_variants(
10033                transcripts_table=transcripts_table,
10034                transcripts_info_field_json=info_json,
10035                transcripts_info_field_format=info_format,
10036            )
10037        else:
10038            log.info("No Transcripts to process. Check param.json file configuration")
10039
10040    def calculation_transcripts_prioritization(self) -> None:
10041        """
10042        The function `calculation_transcripts_prioritization` creates a transcripts table and
10043        prioritizes transcripts based on certain criteria.
10044        """
10045
10046        # Create transcripts table
10047        transcripts_table = self.create_transcript_view()
10048
10049        # Add info field
10050        if transcripts_table:
10051            self.transcripts_prioritization(transcripts_table=transcripts_table)
10052        else:
10053            log.info("No Transcripts to process. Check param.json file configuration")
10054
10055    def calculation_transcripts_export(self) -> None:
10056        """ """
10057
10058        # Create transcripts table
10059        transcripts_table = self.create_transcript_view()
10060
10061        # Add info field
10062        if transcripts_table:
10063            self.transcripts_export(transcripts_table=transcripts_table)
10064        else:
10065            log.info("No Transcripts to process. Check param.json file configuration")
10066
10067    ###############
10068    # Transcripts #
10069    ###############
10070
10071    def transcripts_export(
10072        self, transcripts_table: str = None, param: dict = {}
10073    ) -> bool:
10074        """ """
10075
10076        log.debug("Start transcripts export...")
10077
10078        # Param
10079        if not param:
10080            param = self.get_param()
10081
10082        # Param export
10083        param_transcript_export = param.get("transcripts", {}).get("export", {})
10084
10085        # Output file
10086        transcripts_export_output = param_transcript_export.get("output", None)
10087
10088        if not param_transcript_export or not transcripts_export_output:
10089            log.warning(f"No transcriipts export parameters defined!")
10090            return False
10091
10092        # List of transcripts annotations
10093        query_describe = f"""
10094            SELECT column_name
10095            FROM (
10096                    DESCRIBE SELECT * FROM {transcripts_table}
10097                )
10098            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10099        """
10100        transcripts_annotations_list = list(
10101            self.get_query_to_df(query=query_describe)["column_name"]
10102        )
10103
10104        # Create transcripts table for export
10105        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10106            random.choices(string.ascii_uppercase + string.digits, k=10)
10107        )
10108        query_create_transcripts_table_export = f"""
10109            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10110        """
10111        self.execute_query(query=query_create_transcripts_table_export)
10112
10113        # Output file format
10114        transcripts_export_output_format = get_file_format(
10115            filename=transcripts_export_output
10116        )
10117
10118        # Format VCF - construct INFO
10119        if transcripts_export_output_format in ["vcf"]:
10120
10121            # Construct query update INFO and header
10122            query_update_info = []
10123            for field in transcripts_annotations_list:
10124
10125                # If field not in header
10126                if field not in self.get_header_infos_list():
10127
10128                    # Add PZ Transcript in header
10129                    self.get_header().infos[field] = vcf.parser._Info(
10130                        field,
10131                        ".",
10132                        "String",
10133                        f"Annotation '{field}' from transcript view",
10134                        "unknown",
10135                        "unknown",
10136                        0,
10137                    )
10138
10139                # Add field as INFO/tag
10140                query_update_info.append(
10141                    f"""
10142                        CASE
10143                            WHEN "{field}" IS NOT NULL
10144                            THEN concat('{field}=', "{field}", ';')    
10145                            ELSE ''     
10146                        END
10147                        """
10148                )
10149
10150            # Query param
10151            query_update_info_value = (
10152                f""" concat('',  {", ".join(query_update_info)}) """
10153            )
10154            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10155
10156        else:
10157
10158            # Query param
10159            query_update_info_value = f""" NULL """
10160            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10161
10162        # Update query INFO column
10163        query_update = f"""
10164            UPDATE {transcripts_table_export}
10165            SET INFO = {query_update_info_value}
10166
10167        """
10168        self.execute_query(query=query_update)
10169
10170        # Export
10171        self.export_output(
10172            output_file=transcripts_export_output,
10173            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10174        )
10175
10176        # Drop transcripts export table
10177        query_drop_transcripts_table_export = f"""
10178            DROP TABLE {transcripts_table_export}
10179        """
10180        self.execute_query(query=query_drop_transcripts_table_export)
10181
10182    def transcripts_prioritization(
10183        self, transcripts_table: str = None, param: dict = {}
10184    ) -> bool:
10185        """
10186        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10187        and updates the variants table with the prioritized information.
10188
10189        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10190        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10191        This parameter is used to identify the table where the transcripts data is stored for the
10192        prioritization process
10193        :type transcripts_table: str
10194        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10195        that contains various configuration settings for the prioritization process of transcripts. It
10196        is used to customize the behavior of the prioritization algorithm and includes settings such as
10197        the prefix for prioritization fields, default profiles, and other
10198        :type param: dict
10199        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10200        transcripts prioritization process is successfully completed, and `False` if there are any
10201        issues or if no profile is defined for transcripts prioritization.
10202        """
10203
10204        log.debug("Start transcripts prioritization...")
10205
10206        # Param
10207        if not param:
10208            param = self.get_param()
10209
10210        # Variants table
10211        table_variants = self.get_table_variants()
10212
10213        # Transcripts table
10214        if transcripts_table is None:
10215            transcripts_table = self.create_transcript_view(
10216                transcripts_table="transcripts", param=param
10217            )
10218        if transcripts_table is None:
10219            msg_err = "No Transcripts table availalble"
10220            log.error(msg_err)
10221            raise ValueError(msg_err)
10222        log.debug(f"transcripts_table={transcripts_table}")
10223
10224        # Get transcripts columns
10225        columns_as_list_query = f"""
10226            DESCRIBE {transcripts_table}
10227        """
10228        columns_as_list = list(
10229            self.get_query_to_df(columns_as_list_query)["column_name"]
10230        )
10231
10232        # Create INFO if not exists
10233        if "INFO" not in columns_as_list:
10234            query_add_info = f"""
10235                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10236            """
10237            self.execute_query(query_add_info)
10238
10239        # Prioritization param and Force only PZ Score and Flag
10240        pz_param = param.get("transcripts", {}).get("prioritization", {})
10241
10242        # PZ profile by default
10243        pz_profile_default = (
10244            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10245        )
10246
10247        # Exit if no profile
10248        if pz_profile_default is None:
10249            log.warning("No profile defined for transcripts prioritization")
10250            return False
10251
10252        # PZ fields
10253        pz_param_pzfields = {}
10254
10255        # PZ field transcripts
10256        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10257
10258        # Add PZ Transcript in header
10259        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10260            pz_fields_transcripts,
10261            ".",
10262            "String",
10263            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10264            "unknown",
10265            "unknown",
10266            code_type_map["String"],
10267        )
10268
10269        # Mandatory fields
10270        pz_mandatory_fields_list = [
10271            "Score",
10272            "Flag",
10273            "Tags",
10274            "Comment",
10275            "Infos",
10276            "Class",
10277        ]
10278        pz_mandatory_fields = []
10279        for pz_mandatory_field in pz_mandatory_fields_list:
10280            pz_mandatory_fields.append(
10281                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10282            )
10283
10284        # PZ fields in param
10285        for pz_field in pz_param.get("pzfields", []):
10286            if pz_field in pz_mandatory_fields_list:
10287                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10288                    pz_param.get("pzprefix", "PTZ") + pz_field
10289                )
10290            else:
10291                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10292                pz_param_pzfields[pz_field] = pz_field_new
10293
10294                # Add PZ Transcript in header
10295                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10296                    pz_field_new,
10297                    ".",
10298                    "String",
10299                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10300                    "unknown",
10301                    "unknown",
10302                    code_type_map["String"],
10303                )
10304
10305        # PZ fields param
10306        pz_param["pzfields"] = pz_mandatory_fields
10307
10308        # Prioritization
10309        prioritization_result = self.prioritization(
10310            table=transcripts_table,
10311            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10312        )
10313        if not prioritization_result:
10314            log.warning("Transcripts prioritization not processed")
10315            return False
10316
10317        # PZ fields sql query
10318        query_update_select_list = []
10319        query_update_concat_list = []
10320        query_update_order_list = []
10321        for pz_param_pzfield in set(
10322            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10323        ):
10324            query_update_select_list.append(f" {pz_param_pzfield}, ")
10325
10326        for pz_param_pzfield in pz_param_pzfields:
10327            query_update_concat_list.append(
10328                f"""
10329                    , CASE 
10330                        WHEN {pz_param_pzfield} IS NOT NULL
10331                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10332                        ELSE ''
10333                    END
10334                """
10335            )
10336
10337        # Order by
10338        pz_orders = (
10339            param.get("transcripts", {})
10340            .get("prioritization", {})
10341            .get("prioritization_transcripts_order", {})
10342        )
10343        if not pz_orders:
10344            pz_orders = {
10345                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10346                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10347            }
10348        for pz_order in pz_orders:
10349            query_update_order_list.append(
10350                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10351            )
10352
10353        # Fields to explode
10354        fields_to_explode = (
10355            list(pz_param_pzfields.keys())
10356            + pz_mandatory_fields
10357            + list(pz_orders.keys())
10358        )
10359        # Remove transcript column as a specific transcript column
10360        if "transcript" in fields_to_explode:
10361            fields_to_explode.remove("transcript")
10362
10363        # Fields intranscripts table
10364        query_transcripts_table = f"""
10365            DESCRIBE SELECT * FROM {transcripts_table}
10366        """
10367        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10368
10369        # Check fields to explode
10370        for field_to_explode in fields_to_explode:
10371            if field_to_explode not in self.get_header_infos_list() + list(
10372                query_transcripts_table.column_name
10373            ):
10374                msg_err = f"INFO/{field_to_explode} NOT IN header"
10375                log.error(msg_err)
10376                raise ValueError(msg_err)
10377
10378        # Explode fields to explode
10379        self.explode_infos(
10380            table=transcripts_table,
10381            fields=fields_to_explode,
10382        )
10383
10384        # Transcript preference file
10385        transcripts_preference_file = (
10386            param.get("transcripts", {})
10387            .get("prioritization", {})
10388            .get("prioritization_transcripts", {})
10389        )
10390        transcripts_preference_file = full_path(transcripts_preference_file)
10391
10392        # Transcript preference forced
10393        transcript_preference_force = (
10394            param.get("transcripts", {})
10395            .get("prioritization", {})
10396            .get("prioritization_transcripts_force", False)
10397        )
10398        # Transcript version forced
10399        transcript_version_force = (
10400            param.get("transcripts", {})
10401            .get("prioritization", {})
10402            .get("prioritization_transcripts_version_force", False)
10403        )
10404
10405        # Transcripts Ranking
10406        if transcripts_preference_file:
10407
10408            # Transcripts file to dataframe
10409            if os.path.exists(transcripts_preference_file):
10410                transcripts_preference_dataframe = transcripts_file_to_df(
10411                    transcripts_preference_file
10412                )
10413            else:
10414                log.error(
10415                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10416                )
10417                raise ValueError(
10418                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10419                )
10420
10421            # Order by depending to transcript preference forcing
10422            if transcript_preference_force:
10423                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10424            else:
10425                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10426
10427            # Transcript columns joined depend on version consideration
10428            if transcript_version_force:
10429                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10430            else:
10431                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10432
10433            # Query ranking for update
10434            query_update_ranking = f"""
10435                SELECT
10436                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10437                    ROW_NUMBER() OVER (
10438                        PARTITION BY "#CHROM", POS, REF, ALT
10439                        ORDER BY {order_by}
10440                    ) AS rn
10441                FROM {transcripts_table}
10442                LEFT JOIN 
10443                    (
10444                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10445                        FROM transcripts_preference_dataframe
10446                    ) AS transcripts_preference
10447                ON {transcripts_version_join}
10448            """
10449
10450        else:
10451
10452            # Query ranking for update
10453            query_update_ranking = f"""
10454                SELECT
10455                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10456                    ROW_NUMBER() OVER (
10457                        PARTITION BY "#CHROM", POS, REF, ALT
10458                        ORDER BY {" , ".join(query_update_order_list)}
10459                    ) AS rn
10460                FROM {transcripts_table}
10461            """
10462
10463        # Export Transcripts prioritization infos to variants table
10464        query_update = f"""
10465            WITH RankedTranscripts AS (
10466                {query_update_ranking}
10467            )
10468            UPDATE {table_variants}
10469                SET
10470                INFO = CONCAT(CASE
10471                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10472                            THEN ''
10473                            ELSE concat("INFO", ';')
10474                        END,
10475                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10476                        )
10477            FROM
10478                RankedTranscripts
10479            WHERE
10480                rn = 1
10481                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10482                AND variants."POS" = RankedTranscripts."POS"
10483                AND variants."REF" = RankedTranscripts."REF"
10484                AND variants."ALT" = RankedTranscripts."ALT"     
10485        """
10486
10487        # log.debug(f"query_update={query_update}")
10488        self.execute_query(query=query_update)
10489
10490        # Return
10491        return True
10492
10493    def create_transcript_view_from_columns_map(
10494        self,
10495        transcripts_table: str = "transcripts",
10496        columns_maps: dict = {},
10497        added_columns: list = [],
10498        temporary_tables: list = None,
10499        annotation_fields: list = None,
10500        column_rename: dict = {},
10501        column_clean: bool = False,
10502        column_case: str = None,
10503    ) -> tuple[list, list, list]:
10504        """
10505        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10506        specified columns mapping for transcripts data.
10507
10508        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10509        of the table where the transcripts data is stored or will be stored in the database. This table
10510        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10511        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10512        :type transcripts_table: str (optional)
10513        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10514        about how to map columns from a transcripts table to create a view. Each entry in the
10515        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10516        typically includes details such as the main transcript column and additional information columns
10517        :type columns_maps: dict
10518        :param added_columns: The `added_columns` parameter in the
10519        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10520        that will be added to the view being created based on the columns map provided. These columns
10521        are generated by exploding the transcript information columns along with the main transcript
10522        column
10523        :type added_columns: list
10524        :param temporary_tables: The `temporary_tables` parameter in the
10525        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10526        tables created during the process of creating a transcript view from a columns map. These
10527        temporary tables are used to store intermediate results or transformations before the final view
10528        is generated
10529        :type temporary_tables: list
10530        :param annotation_fields: The `annotation_fields` parameter in the
10531        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10532        used for annotation in the query view creation process. These fields are extracted from the
10533        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10534        :type annotation_fields: list
10535        :param column_rename: The `column_rename` parameter in the
10536        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10537        custom renaming for columns during the creation of the temporary table view. This parameter
10538        provides a mapping of original column names to the desired renamed column names. By using this
10539        parameter,
10540        :type column_rename: dict
10541        :param column_clean: The `column_clean` parameter in the
10542        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10543        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10544        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10545        False
10546        :type column_clean: bool (optional)
10547        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10548        function is used to specify the case transformation to be applied to the columns during the view
10549        creation process. It allows you to control whether the column values should be converted to
10550        lowercase, uppercase, or remain unchanged
10551        :type column_case: str
10552        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10553        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10554        """
10555
10556        log.debug("Start transcrpts view creation from columns map...")
10557
10558        # "from_columns_map": [
10559        #     {
10560        #         "transcripts_column": "Ensembl_transcriptid",
10561        #         "transcripts_infos_columns": [
10562        #             "genename",
10563        #             "Ensembl_geneid",
10564        #             "LIST_S2_score",
10565        #             "LIST_S2_pred",
10566        #         ],
10567        #     },
10568        #     {
10569        #         "transcripts_column": "Ensembl_transcriptid",
10570        #         "transcripts_infos_columns": [
10571        #             "genename",
10572        #             "VARITY_R_score",
10573        #             "Aloft_pred",
10574        #         ],
10575        #     },
10576        # ],
10577
10578        # Init
10579        if temporary_tables is None:
10580            temporary_tables = []
10581        if annotation_fields is None:
10582            annotation_fields = []
10583
10584        # Variants table
10585        table_variants = self.get_table_variants()
10586
10587        for columns_map in columns_maps:
10588
10589            # Transcript column
10590            transcripts_column = columns_map.get("transcripts_column", None)
10591
10592            # Transcripts infos columns
10593            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10594
10595            # Transcripts infos columns rename
10596            column_rename = columns_map.get("column_rename", column_rename)
10597
10598            # Transcripts infos columns clean
10599            column_clean = columns_map.get("column_clean", column_clean)
10600
10601            # Transcripts infos columns case
10602            column_case = columns_map.get("column_case", column_case)
10603
10604            if transcripts_column is not None:
10605
10606                # Explode
10607                added_columns += self.explode_infos(
10608                    fields=[transcripts_column] + transcripts_infos_columns
10609                )
10610
10611                # View clauses
10612                clause_select_variants = []
10613                clause_select_tanscripts = []
10614                for field in [transcripts_column] + transcripts_infos_columns:
10615
10616                    # AS field
10617                    as_field = field
10618
10619                    # Rename
10620                    if column_rename:
10621                        as_field = column_rename.get(as_field, as_field)
10622
10623                    # Clean
10624                    if column_clean:
10625                        as_field = clean_annotation_field(as_field)
10626
10627                    # Case
10628                    if column_case:
10629                        if column_case.lower() in ["lower"]:
10630                            as_field = as_field.lower()
10631                        elif column_case.lower() in ["upper"]:
10632                            as_field = as_field.upper()
10633
10634                    # Clause select Variants
10635                    clause_select_variants.append(
10636                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10637                    )
10638
10639                    if field in [transcripts_column]:
10640                        clause_select_tanscripts.append(
10641                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10642                        )
10643                    else:
10644                        clause_select_tanscripts.append(
10645                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10646                        )
10647                        annotation_fields.append(as_field)
10648
10649                # Querey View
10650                query = f""" 
10651                    SELECT
10652                        "#CHROM", POS, REF, ALT, INFO,
10653                        "{transcripts_column}" AS 'transcript',
10654                        {", ".join(clause_select_tanscripts)}
10655                    FROM (
10656                        SELECT 
10657                            "#CHROM", POS, REF, ALT, INFO,
10658                            {", ".join(clause_select_variants)}
10659                        FROM {table_variants}
10660                        )
10661                    WHERE "{transcripts_column}" IS NOT NULL
10662                """
10663
10664                # Create temporary table
10665                temporary_table = transcripts_table + "".join(
10666                    random.choices(string.ascii_uppercase + string.digits, k=10)
10667                )
10668
10669                # Temporary_tables
10670                temporary_tables.append(temporary_table)
10671                query_view = f"""
10672                    CREATE TEMPORARY TABLE {temporary_table}
10673                    AS ({query})
10674                """
10675                self.execute_query(query=query_view)
10676
10677        return added_columns, temporary_tables, annotation_fields
10678
10679    def create_transcript_view_from_column_format(
10680        self,
10681        transcripts_table: str = "transcripts",
10682        column_formats: dict = {},
10683        temporary_tables: list = None,
10684        annotation_fields: list = None,
10685        column_rename: dict = {},
10686        column_clean: bool = False,
10687        column_case: str = None,
10688    ) -> tuple[list, list, list]:
10689        """
10690        The `create_transcript_view_from_column_format` function generates a transcript view based on
10691        specified column formats, adds additional columns and annotation fields, and returns the list of
10692        temporary tables and annotation fields.
10693
10694        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10695        of the table containing the transcripts data. This table will be used as the base table for
10696        creating the transcript view. The default value for this parameter is "transcripts", but you can
10697        provide a different table name if needed, defaults to transcripts
10698        :type transcripts_table: str (optional)
10699        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10700        about the columns to be used for creating the transcript view. Each entry in the dictionary
10701        specifies the mapping between a transcripts column and a transcripts infos column. This
10702        parameter allows you to define how the columns from the transcripts table should be transformed
10703        or mapped
10704        :type column_formats: dict
10705        :param temporary_tables: The `temporary_tables` parameter in the
10706        `create_transcript_view_from_column_format` function is a list that stores the names of
10707        temporary views created during the process of creating a transcript view from a column format.
10708        These temporary views are used to manipulate and extract data before generating the final
10709        transcript view
10710        :type temporary_tables: list
10711        :param annotation_fields: The `annotation_fields` parameter in the
10712        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10713        that are extracted from the temporary views created during the process. These annotation fields
10714        are obtained by querying the temporary views and extracting the column names excluding specific
10715        columns like `#CH
10716        :type annotation_fields: list
10717        :param column_rename: The `column_rename` parameter in the
10718        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10719        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10720        column names to new column names in this dictionary, you can rename specific columns during the
10721        process
10722        :type column_rename: dict
10723        :param column_clean: The `column_clean` parameter in the
10724        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10725        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10726        will be cleaned during the creation of the transcript view based on the specified column format,
10727        defaults to False
10728        :type column_clean: bool (optional)
10729        :param column_case: The `column_case` parameter in the
10730        `create_transcript_view_from_column_format` function is used to specify the case transformation
10731        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10732        to convert the column names to uppercase or lowercase, respectively
10733        :type column_case: str
10734        :return: The `create_transcript_view_from_column_format` function returns two lists:
10735        `temporary_tables` and `annotation_fields`.
10736        """
10737
10738        log.debug("Start transcrpts view creation from column format...")
10739
10740        #  "from_column_format": [
10741        #     {
10742        #         "transcripts_column": "ANN",
10743        #         "transcripts_infos_column": "Feature_ID",
10744        #     }
10745        # ],
10746
10747        # Init
10748        if temporary_tables is None:
10749            temporary_tables = []
10750        if annotation_fields is None:
10751            annotation_fields = []
10752
10753        for column_format in column_formats:
10754
10755            # annotation field and transcript annotation field
10756            annotation_field = column_format.get("transcripts_column", "ANN")
10757            transcript_annotation = column_format.get(
10758                "transcripts_infos_column", "Feature_ID"
10759            )
10760
10761            # Transcripts infos columns rename
10762            column_rename = column_format.get("column_rename", column_rename)
10763
10764            # Transcripts infos columns clean
10765            column_clean = column_format.get("column_clean", column_clean)
10766
10767            # Transcripts infos columns case
10768            column_case = column_format.get("column_case", column_case)
10769
10770            # Temporary View name
10771            temporary_view_name = transcripts_table + "".join(
10772                random.choices(string.ascii_uppercase + string.digits, k=10)
10773            )
10774
10775            # Create temporary view name
10776            temporary_view_name = self.annotation_format_to_table(
10777                uniquify=True,
10778                annotation_field=annotation_field,
10779                view_name=temporary_view_name,
10780                annotation_id=transcript_annotation,
10781                column_rename=column_rename,
10782                column_clean=column_clean,
10783                column_case=column_case,
10784            )
10785
10786            # Annotation fields
10787            if temporary_view_name:
10788                query_annotation_fields = f"""
10789                    SELECT *
10790                    FROM (
10791                        DESCRIBE SELECT *
10792                        FROM {temporary_view_name}
10793                        )
10794                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10795                """
10796                df_annotation_fields = self.get_query_to_df(
10797                    query=query_annotation_fields
10798                )
10799
10800                # Add temporary view and annotation fields
10801                temporary_tables.append(temporary_view_name)
10802                annotation_fields += list(set(df_annotation_fields["column_name"]))
10803
10804        return temporary_tables, annotation_fields
10805
10806    def create_transcript_view(
10807        self,
10808        transcripts_table: str = None,
10809        transcripts_table_drop: bool = False,
10810        param: dict = {},
10811    ) -> str:
10812        """
10813        The `create_transcript_view` function generates a transcript view by processing data from a
10814        specified table based on provided parameters and structural information.
10815
10816        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10817        is used to specify the name of the table that will store the final transcript view data. If a table
10818        name is not provided, the function will create a new table to store the transcript view data, and by
10819        default,, defaults to transcripts
10820        :type transcripts_table: str (optional)
10821        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10822        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10823        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10824        the function will drop the existing transcripts table if it exists, defaults to False
10825        :type transcripts_table_drop: bool (optional)
10826        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10827        contains information needed to create a transcript view. It includes details such as the structure
10828        of the transcripts, columns mapping, column formats, and other necessary information for generating
10829        the view. This parameter allows for flexibility and customization
10830        :type param: dict
10831        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10832        created or modified during the execution of the function.
10833        """
10834
10835        log.debug("Start transcripts view creation...")
10836
10837        # Default
10838        transcripts_table_default = "transcripts"
10839
10840        # Param
10841        if not param:
10842            param = self.get_param()
10843
10844        # Struct
10845        struct = param.get("transcripts", {}).get("struct", None)
10846
10847        # Transcript veresion
10848        transcript_id_remove_version = param.get("transcripts", {}).get(
10849            "transcript_id_remove_version", False
10850        )
10851
10852        # Transcripts mapping
10853        transcript_id_mapping_file = param.get("transcripts", {}).get(
10854            "transcript_id_mapping_file", None
10855        )
10856
10857        # Transcripts mapping
10858        transcript_id_mapping_force = param.get("transcripts", {}).get(
10859            "transcript_id_mapping_force", None
10860        )
10861
10862        if struct:
10863
10864            # Transcripts table
10865            if transcripts_table is None:
10866                transcripts_table = param.get("transcripts", {}).get(
10867                    "table", transcripts_table_default
10868                )
10869
10870            # added_columns
10871            added_columns = []
10872
10873            # Temporary tables
10874            temporary_tables = []
10875
10876            # Annotation fields
10877            annotation_fields = []
10878
10879            # from columns map
10880            columns_maps = struct.get("from_columns_map", [])
10881            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10882                self.create_transcript_view_from_columns_map(
10883                    transcripts_table=transcripts_table,
10884                    columns_maps=columns_maps,
10885                    added_columns=added_columns,
10886                    temporary_tables=temporary_tables,
10887                    annotation_fields=annotation_fields,
10888                )
10889            )
10890            added_columns += added_columns_tmp
10891            temporary_tables += temporary_tables_tmp
10892            annotation_fields += annotation_fields_tmp
10893
10894            # from column format
10895            column_formats = struct.get("from_column_format", [])
10896            temporary_tables_tmp, annotation_fields_tmp = (
10897                self.create_transcript_view_from_column_format(
10898                    transcripts_table=transcripts_table,
10899                    column_formats=column_formats,
10900                    temporary_tables=temporary_tables,
10901                    annotation_fields=annotation_fields,
10902                )
10903            )
10904            temporary_tables += temporary_tables_tmp
10905            annotation_fields += annotation_fields_tmp
10906
10907            # Remove some specific fields/column
10908            annotation_fields = list(set(annotation_fields))
10909            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10910                if field in annotation_fields:
10911                    annotation_fields.remove(field)
10912
10913            # Merge temporary tables query
10914            query_merge = ""
10915            for temporary_table in list(set(temporary_tables)):
10916
10917                # First temporary table
10918                if not query_merge:
10919                    query_merge = f"""
10920                        SELECT * FROM {temporary_table}
10921                    """
10922                # other temporary table (using UNION)
10923                else:
10924                    query_merge += f"""
10925                        UNION BY NAME SELECT * FROM {temporary_table}
10926                    """
10927
10928            # transcript table tmp
10929            transcript_table_tmp = "transcripts_tmp"
10930            transcript_table_tmp2 = "transcripts_tmp2"
10931            transcript_table_tmp3 = "transcripts_tmp3"
10932
10933            # Merge on transcript
10934            query_merge_on_transcripts_annotation_fields = []
10935
10936            # Add transcript list
10937            query_merge_on_transcripts_annotation_fields.append(
10938                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10939            )
10940
10941            # Aggregate all annotations fields
10942            for annotation_field in set(annotation_fields):
10943                query_merge_on_transcripts_annotation_fields.append(
10944                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10945                )
10946
10947            # Transcripts mapping
10948            if transcript_id_mapping_file:
10949
10950                # Transcript dataframe
10951                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10952                transcript_id_mapping_dataframe = transcripts_file_to_df(
10953                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10954                )
10955
10956                # Transcript version remove
10957                if transcript_id_remove_version:
10958                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10959                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10960                    query_left_join = f"""
10961                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10962                    """
10963                else:
10964                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10965                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10966                    query_left_join = f"""
10967                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10968                    """
10969
10970                # Transcript column for group by merge
10971                query_transcript_merge_group_by = """
10972                        CASE
10973                            WHEN transcript_mapped NOT IN ('')
10974                            THEN split_part(transcript_mapped, '.', 1)
10975                            ELSE split_part(transcript_original, '.', 1)
10976                        END
10977                    """
10978
10979                # Merge query
10980                transcripts_tmp2_query = f"""
10981                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10982                    FROM ({query_merge}) AS {transcript_table_tmp}
10983                    {query_left_join}
10984                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10985                """
10986
10987                # Retrive columns after mege
10988                transcripts_tmp2_describe_query = f"""
10989                    DESCRIBE {transcripts_tmp2_query}
10990                """
10991                transcripts_tmp2_describe_list = list(
10992                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10993                        "column_name"
10994                    ]
10995                )
10996
10997                # Create list of columns for select clause
10998                transcripts_tmp2_describe_select_clause = []
10999                for field in transcripts_tmp2_describe_list:
11000                    if field not in [
11001                        "#CHROM",
11002                        "POS",
11003                        "REF",
11004                        "ALT",
11005                        "INFO",
11006                        "transcript_mapped",
11007                    ]:
11008                        as_field = field
11009                        if field in ["transcript_original"]:
11010                            as_field = "transcripts_mapped"
11011                        transcripts_tmp2_describe_select_clause.append(
11012                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11013                        )
11014
11015                # Merge with mapping
11016                query_merge_on_transcripts = f"""
11017                    SELECT
11018                        "#CHROM", POS, REF, ALT, INFO,
11019                        CASE
11020                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11021                            THEN ANY_VALUE(transcript_mapped)
11022                            ELSE ANY_VALUE(transcript_original)
11023                        END AS transcript,
11024                        {", ".join(transcripts_tmp2_describe_select_clause)}
11025                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11026                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11027                        {query_transcript_merge_group_by}
11028                """
11029
11030                # Add transcript filter from mapping file
11031                if transcript_id_mapping_force:
11032                    query_merge_on_transcripts = f"""
11033                        SELECT *
11034                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11035                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11036                    """
11037
11038            # No transcript mapping
11039            else:
11040
11041                # Remove transcript version
11042                if transcript_id_remove_version:
11043                    query_transcript_column = f"""
11044                        split_part({transcript_table_tmp}.transcript, '.', 1)
11045                    """
11046                else:
11047                    query_transcript_column = """
11048                        transcript
11049                    """
11050
11051                # Query sections
11052                query_transcript_column_select = (
11053                    f"{query_transcript_column} AS transcript"
11054                )
11055                query_transcript_column_group_by = query_transcript_column
11056
11057                # Query for transcripts view
11058                query_merge_on_transcripts = f"""
11059                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11060                    FROM ({query_merge}) AS {transcript_table_tmp}
11061                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11062                """
11063
11064            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11065
11066            # Drop transcript view is necessary
11067            if transcripts_table_drop:
11068                query_drop = f"""
11069                    DROP TABLE IF EXISTS {transcripts_table};
11070                """
11071                self.execute_query(query=query_drop)
11072
11073            # Merge and create transcript view
11074            query_create_view = f"""
11075                CREATE TABLE IF NOT EXISTS {transcripts_table}
11076                AS {query_merge_on_transcripts}
11077            """
11078            self.execute_query(query=query_create_view)
11079
11080            # Remove added columns
11081            for added_column in added_columns:
11082                self.drop_column(column=added_column)
11083
11084        else:
11085
11086            transcripts_table = None
11087
11088        return transcripts_table
11089
11090    def annotation_format_to_table(
11091        self,
11092        uniquify: bool = True,
11093        annotation_field: str = "ANN",
11094        annotation_id: str = "Feature_ID",
11095        view_name: str = "transcripts",
11096        column_rename: dict = {},
11097        column_clean: bool = False,
11098        column_case: str = None,
11099    ) -> str:
11100        """
11101        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11102        structured table format, ensuring unique values and creating a temporary table for further
11103        processing or analysis.
11104
11105        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11106        unique values in the output or not. If set to `True`, the function will make sure that the
11107        output values are unique, defaults to True
11108        :type uniquify: bool (optional)
11109        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11110        that contains the annotation information for each variant. This field is used to extract the
11111        annotation details for further processing in the function. By default, it is set to "ANN",
11112        defaults to ANN
11113        :type annotation_field: str (optional)
11114        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11115        is used to specify the identifier for the annotation feature. This identifier will be used as a
11116        column name in the resulting table or view that is created based on the annotation data. It
11117        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11118        :type annotation_id: str (optional)
11119        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11120        to specify the name of the temporary table that will be created to store the transformed
11121        annotation data. This table will hold the extracted information from the annotation field in a
11122        structured format for further processing or analysis. By default,, defaults to transcripts
11123        :type view_name: str (optional)
11124        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11125        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11126        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11127        created based on the annotation data. This feature enables
11128        :type column_rename: dict
11129        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11130        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11131        If set to `True`, the function will clean the annotation field before further processing. This
11132        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11133        to False
11134        :type column_clean: bool (optional)
11135        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11136        used to specify the case transformation to be applied to the column names extracted from the
11137        annotation data. It allows you to set the case of the column names to either lowercase or
11138        uppercase for consistency or other specific requirements during the conversion
11139        :type column_case: str
11140        :return: The function `annotation_format_to_table` is returning the name of the view created,
11141        which is stored in the variable `view_name`.
11142        """
11143
11144        # Annotation field
11145        annotation_format = "annotation_explode"
11146
11147        # Transcript annotation
11148        if column_rename:
11149            annotation_id = column_rename.get(annotation_id, annotation_id)
11150
11151        if column_clean:
11152            annotation_id = clean_annotation_field(annotation_id)
11153
11154        # Prefix
11155        prefix = self.get_explode_infos_prefix()
11156        if prefix:
11157            prefix = "INFO/"
11158
11159        # Annotation fields
11160        annotation_infos = prefix + annotation_field
11161        annotation_format_infos = prefix + annotation_format
11162
11163        # Variants table
11164        table_variants = self.get_table_variants()
11165
11166        # Header
11167        vcf_reader = self.get_header()
11168
11169        # Add columns
11170        added_columns = []
11171
11172        # Explode HGVS field in column
11173        added_columns += self.explode_infos(fields=[annotation_field])
11174
11175        if annotation_field in vcf_reader.infos:
11176
11177            # Extract ANN header
11178            ann_description = vcf_reader.infos[annotation_field].desc
11179            pattern = r"'(.+?)'"
11180            match = re.search(pattern, ann_description)
11181            if match:
11182                ann_header_match = match.group(1).split(" | ")
11183                ann_header = []
11184                ann_header_desc = {}
11185                for i in range(len(ann_header_match)):
11186                    ann_header_info = "".join(
11187                        char for char in ann_header_match[i] if char.isalnum()
11188                    )
11189                    ann_header.append(ann_header_info)
11190                    ann_header_desc[ann_header_info] = ann_header_match[i]
11191                if not ann_header_desc:
11192                    raise ValueError("Invalid header description format")
11193            else:
11194                raise ValueError("Invalid header description format")
11195
11196            # Create variant id
11197            variant_id_column = self.get_variant_id_column()
11198            added_columns += [variant_id_column]
11199
11200            # Create dataframe
11201            dataframe_annotation_format = self.get_query_to_df(
11202                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11203            )
11204
11205            # Create annotation columns
11206            dataframe_annotation_format[
11207                annotation_format_infos
11208            ] = dataframe_annotation_format[annotation_infos].apply(
11209                lambda x: explode_annotation_format(
11210                    annotation=str(x),
11211                    uniquify=uniquify,
11212                    output_format="JSON",
11213                    prefix="",
11214                    header=list(ann_header_desc.values()),
11215                )
11216            )
11217
11218            # Find keys
11219            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11220            df_keys = self.get_query_to_df(query=query_json)
11221
11222            # Check keys
11223            query_json_key = []
11224            for _, row in df_keys.iterrows():
11225
11226                # Key
11227                key = row.iloc[0]
11228                key_clean = key
11229
11230                # key rename
11231                if column_rename:
11232                    key_clean = column_rename.get(key_clean, key_clean)
11233
11234                # key clean
11235                if column_clean:
11236                    key_clean = clean_annotation_field(key_clean)
11237
11238                # Key case
11239                if column_case:
11240                    if column_case.lower() in ["lower"]:
11241                        key_clean = key_clean.lower()
11242                    elif column_case.lower() in ["upper"]:
11243                        key_clean = key_clean.upper()
11244
11245                # Type
11246                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11247
11248                # Get DataFrame from query
11249                df_json_type = self.get_query_to_df(query=query_json_type)
11250
11251                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11252                with pd.option_context("future.no_silent_downcasting", True):
11253                    df_json_type.fillna(value="", inplace=True)
11254                    replace_dict = {None: np.nan, "": np.nan}
11255                    df_json_type.replace(replace_dict, inplace=True)
11256                    df_json_type.dropna(inplace=True)
11257
11258                # Detect column type
11259                column_type = detect_column_type(df_json_type[key_clean])
11260
11261                # Append
11262                query_json_key.append(
11263                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11264                )
11265
11266            # Create view
11267            query_view = f"""
11268                CREATE TEMPORARY TABLE {view_name}
11269                AS (
11270                    SELECT *, {annotation_id} AS 'transcript'
11271                    FROM (
11272                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11273                        FROM dataframe_annotation_format
11274                        )
11275                    );
11276            """
11277            self.execute_query(query=query_view)
11278
11279        else:
11280
11281            # Return None
11282            view_name = None
11283
11284        # Remove added columns
11285        for added_column in added_columns:
11286            self.drop_column(column=added_column)
11287
11288        return view_name
11289
11290    def transcript_view_to_variants(
11291        self,
11292        transcripts_table: str = None,
11293        transcripts_column_id: str = None,
11294        transcripts_info_json: str = None,
11295        transcripts_info_field_json: str = None,
11296        transcripts_info_format: str = None,
11297        transcripts_info_field_format: str = None,
11298        param: dict = {},
11299    ) -> bool:
11300        """
11301        The `transcript_view_to_variants` function updates a variants table with information from
11302        transcripts in JSON format.
11303
11304        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11305        table containing the transcripts data. If this parameter is not provided, the function will
11306        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11307        :type transcripts_table: str
11308        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11309        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11310        identifier is used to match transcripts with variants in the database
11311        :type transcripts_column_id: str
11312        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11313        of the column in the variants table where the transcripts information will be stored in JSON
11314        format. This parameter allows you to define the column in the variants table that will hold the
11315        JSON-formatted information about transcripts
11316        :type transcripts_info_json: str
11317        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11318        specify the field in the VCF header that will contain information about transcripts in JSON
11319        format. This field will be added to the VCF header as an INFO field with the specified name
11320        :type transcripts_info_field_json: str
11321        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11322        format of the information about transcripts that will be stored in the variants table. This
11323        format can be used to define how the transcript information will be structured or displayed
11324        within the variants table
11325        :type transcripts_info_format: str
11326        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11327        specify the field in the VCF header that will contain information about transcripts in a
11328        specific format. This field will be added to the VCF header as an INFO field with the specified
11329        name
11330        :type transcripts_info_field_format: str
11331        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11332        that contains various configuration settings related to transcripts. It is used to provide
11333        default values for certain parameters if they are not explicitly provided when calling the
11334        method. The `param` dictionary can be passed as an argument
11335        :type param: dict
11336        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11337        if the operation is successful and `False` if certain conditions are not met.
11338        """
11339
11340        msg_info_prefix = "Start transcripts view to variants annotations"
11341
11342        log.debug(f"{msg_info_prefix}...")
11343
11344        # Default
11345        transcripts_table_default = "transcripts"
11346        transcripts_column_id_default = "transcript"
11347        transcripts_info_json_default = None
11348        transcripts_info_format_default = None
11349        transcripts_info_field_json_default = None
11350        transcripts_info_field_format_default = None
11351
11352        # Param
11353        if not param:
11354            param = self.get_param()
11355
11356        # Transcripts table
11357        if transcripts_table is None:
11358            transcripts_table = param.get("transcripts", {}).get(
11359                "table", transcripts_table_default
11360            )
11361
11362        # Transcripts column ID
11363        if transcripts_column_id is None:
11364            transcripts_column_id = param.get("transcripts", {}).get(
11365                "column_id", transcripts_column_id_default
11366            )
11367
11368        # Transcripts info json
11369        if transcripts_info_json is None:
11370            transcripts_info_json = param.get("transcripts", {}).get(
11371                "transcripts_info_json", transcripts_info_json_default
11372            )
11373
11374        # Transcripts info field JSON
11375        if transcripts_info_field_json is None:
11376            transcripts_info_field_json = param.get("transcripts", {}).get(
11377                "transcripts_info_field_json", transcripts_info_field_json_default
11378            )
11379        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11380        #     transcripts_info_json = transcripts_info_field_json
11381
11382        # Transcripts info format
11383        if transcripts_info_format is None:
11384            transcripts_info_format = param.get("transcripts", {}).get(
11385                "transcripts_info_format", transcripts_info_format_default
11386            )
11387
11388        # Transcripts info field FORMAT
11389        if transcripts_info_field_format is None:
11390            transcripts_info_field_format = param.get("transcripts", {}).get(
11391                "transcripts_info_field_format", transcripts_info_field_format_default
11392            )
11393        # if (
11394        #     transcripts_info_field_format is not None
11395        #     and transcripts_info_format is None
11396        # ):
11397        #     transcripts_info_format = transcripts_info_field_format
11398
11399        # Variants table
11400        table_variants = self.get_table_variants()
11401
11402        # Check info columns param
11403        if (
11404            transcripts_info_json is None
11405            and transcripts_info_field_json is None
11406            and transcripts_info_format is None
11407            and transcripts_info_field_format is None
11408        ):
11409            return False
11410
11411        # Transcripts infos columns
11412        query_transcripts_infos_columns = f"""
11413            SELECT *
11414            FROM (
11415                DESCRIBE SELECT * FROM {transcripts_table}
11416                )
11417            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11418        """
11419        transcripts_infos_columns = list(
11420            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11421        )
11422
11423        # View results
11424        clause_select = []
11425        clause_to_json = []
11426        clause_to_format = []
11427        for field in transcripts_infos_columns:
11428            # Do not consider INFO field for export into fields
11429            if field not in ["INFO"]:
11430                clause_select.append(
11431                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11432                )
11433                clause_to_json.append(f""" '{field}': "{field}" """)
11434                clause_to_format.append(f""" "{field}" """)
11435
11436        # Update
11437        update_set_json = []
11438        update_set_format = []
11439
11440        # VCF header
11441        vcf_reader = self.get_header()
11442
11443        # Transcripts to info column in JSON
11444        if transcripts_info_json:
11445
11446            # Create column on variants table
11447            self.add_column(
11448                table_name=table_variants,
11449                column_name=transcripts_info_json,
11450                column_type="JSON",
11451                default_value=None,
11452                drop=False,
11453            )
11454
11455            # Add header
11456            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11457                transcripts_info_json,
11458                ".",
11459                "String",
11460                "Transcripts in JSON format",
11461                "unknwon",
11462                "unknwon",
11463                self.code_type_map["String"],
11464            )
11465
11466            # Add to update
11467            update_set_json.append(
11468                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11469            )
11470
11471        # Transcripts to info field in JSON
11472        if transcripts_info_field_json:
11473
11474            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11475
11476            # Add to update
11477            update_set_json.append(
11478                f""" 
11479                    INFO = concat(
11480                            CASE
11481                                WHEN INFO NOT IN ('', '.')
11482                                THEN INFO
11483                                ELSE ''
11484                            END,
11485                            CASE
11486                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11487                                THEN concat(
11488                                    ';{transcripts_info_field_json}=',
11489                                    t.{transcripts_info_json}
11490                                )
11491                                ELSE ''
11492                            END
11493                            )
11494                """
11495            )
11496
11497            # Add header
11498            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11499                transcripts_info_field_json,
11500                ".",
11501                "String",
11502                "Transcripts in JSON format",
11503                "unknwon",
11504                "unknwon",
11505                self.code_type_map["String"],
11506            )
11507
11508        if update_set_json:
11509
11510            # Update query
11511            query_update = f"""
11512                UPDATE {table_variants}
11513                    SET {", ".join(update_set_json)}
11514                FROM
11515                (
11516                    SELECT
11517                        "#CHROM", POS, REF, ALT,
11518                            concat(
11519                            '{{',
11520                            string_agg(
11521                                '"' || "{transcripts_column_id}" || '":' ||
11522                                to_json(json_output)
11523                            ),
11524                            '}}'
11525                            )::JSON AS {transcripts_info_json}
11526                    FROM
11527                        (
11528                        SELECT
11529                            "#CHROM", POS, REF, ALT,
11530                            "{transcripts_column_id}",
11531                            to_json(
11532                                {{{",".join(clause_to_json)}}}
11533                            )::JSON AS json_output
11534                        FROM
11535                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11536                        WHERE "{transcripts_column_id}" IS NOT NULL
11537                        )
11538                    GROUP BY "#CHROM", POS, REF, ALT
11539                ) AS t
11540                WHERE {table_variants}."#CHROM" = t."#CHROM"
11541                    AND {table_variants}."POS" = t."POS"
11542                    AND {table_variants}."REF" = t."REF"
11543                    AND {table_variants}."ALT" = t."ALT"
11544            """
11545
11546            self.execute_query(query=query_update)
11547
11548        # Transcripts to info column in FORMAT
11549        if transcripts_info_format:
11550
11551            # Create column on variants table
11552            self.add_column(
11553                table_name=table_variants,
11554                column_name=transcripts_info_format,
11555                column_type="VARCHAR",
11556                default_value=None,
11557                drop=False,
11558            )
11559
11560            # Add header
11561            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11562                transcripts_info_format,
11563                ".",
11564                "String",
11565                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11566                "unknwon",
11567                "unknwon",
11568                self.code_type_map["String"],
11569            )
11570
11571            # Add to update
11572            update_set_format.append(
11573                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11574            )
11575
11576        else:
11577
11578            # Set variable for internal queries
11579            transcripts_info_format = "transcripts_info_format"
11580
11581        # Transcripts to info field in JSON
11582        if transcripts_info_field_format:
11583
11584            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11585
11586            # Add to update
11587            update_set_format.append(
11588                f""" 
11589                    INFO = concat(
11590                            CASE
11591                                WHEN INFO NOT IN ('', '.')
11592                                THEN INFO
11593                                ELSE ''
11594                            END,
11595                            CASE
11596                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11597                                THEN concat(
11598                                    ';{transcripts_info_field_format}=',
11599                                    t.{transcripts_info_format}
11600                                )
11601                                ELSE ''
11602                            END
11603                            )
11604                """
11605            )
11606
11607            # Add header
11608            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11609                transcripts_info_field_format,
11610                ".",
11611                "String",
11612                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11613                "unknwon",
11614                "unknwon",
11615                self.code_type_map["String"],
11616            )
11617
11618        if update_set_format:
11619
11620            # Update query
11621            query_update = f"""
11622                UPDATE {table_variants}
11623                    SET {", ".join(update_set_format)}
11624                FROM
11625                (
11626                    SELECT
11627                        "#CHROM", POS, REF, ALT,
11628                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11629                    FROM 
11630                        (
11631                        SELECT
11632                            "#CHROM", POS, REF, ALT,
11633                            "{transcripts_column_id}",
11634                            concat(
11635                                "{transcripts_column_id}",
11636                                '|',
11637                                {", '|', ".join(clause_to_format)}
11638                            ) AS {transcripts_info_format}
11639                        FROM
11640                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11641                        )
11642                    GROUP BY "#CHROM", POS, REF, ALT
11643                ) AS t
11644                WHERE {table_variants}."#CHROM" = t."#CHROM"
11645                    AND {table_variants}."POS" = t."POS"
11646                    AND {table_variants}."REF" = t."REF"
11647                    AND {table_variants}."ALT" = t."ALT"
11648            """
11649
11650            self.execute_query(query=query_update)
11651
11652        return True
11653
11654    def rename_info_fields(
11655        self, fields_to_rename: dict = None, table: str = None
11656    ) -> dict:
11657        """
11658        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11659        corresponding INFO fields in the variants table.
11660
11661        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11662        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11663        represent the original field names that need to be renamed, and the corresponding values
11664        represent the new names to which the fields should be
11665        :type fields_to_rename: dict
11666        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11667        the table in which the variants data is stored. This table contains information about genetic
11668        variants, and the function updates the corresponding INFO fields in this table when renaming
11669        specified fields in the VCF file header
11670        :type table: str
11671        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11672        the original field names as keys and their corresponding new names (or None if the field was
11673        removed) as values after renaming or removing specified fields in a VCF file header and updating
11674        corresponding INFO fields in the variants table.
11675        """
11676
11677        # Init
11678        fields_renamed = {}
11679        config = self.get_config()
11680        access = config.get("access")
11681
11682        if table is None:
11683            table = self.get_table_variants()
11684
11685        # regexp replace fonction
11686        regex_replace_dict = {}
11687        regex_replace_nb = 0
11688        regex_replace_partition = 125
11689        regex_replace = "INFO"
11690
11691        if fields_to_rename is not None and access not in ["RO"]:
11692
11693            log.info("Rename or remove fields...")
11694
11695            # Header
11696            header = self.get_header()
11697
11698            for field_to_rename, field_renamed in fields_to_rename.items():
11699
11700                if field_to_rename in header.infos:
11701
11702                    # Rename header
11703                    if field_renamed is not None:
11704                        header.infos[field_renamed] = vcf.parser._Info(
11705                            field_renamed,
11706                            header.infos[field_to_rename].num,
11707                            header.infos[field_to_rename].type,
11708                            header.infos[field_to_rename].desc,
11709                            header.infos[field_to_rename].source,
11710                            header.infos[field_to_rename].version,
11711                            header.infos[field_to_rename].type_code,
11712                        )
11713                    del header.infos[field_to_rename]
11714
11715                    # Rename INFO patterns
11716                    field_pattern = rf'(^|;)({field_to_rename})($|;|=[^;]*)'
11717                    if field_renamed is not None:
11718                        field_renamed_pattern = rf'\1{field_renamed}\3'
11719                    else:
11720                        field_renamed_pattern = ''
11721
11722                    # regexp replace
11723                    regex_replace_nb += 1
11724                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
11725                    if (regex_replace_nb % regex_replace_partition) == 0:
11726                        regex_replace = "INFO"
11727                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
11728                    regex_replace_dict[regex_replace_key] = regex_replace
11729
11730                    # Return
11731                    fields_renamed[field_to_rename] = field_renamed
11732
11733                    # Log
11734                    if field_renamed is not None:
11735                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
11736                    else:
11737                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")
11738
11739                else:
11740
11741                    log.warning(f"Rename or remove fields - field '{field_to_rename}' not in header")
11742
11743
11744            # Rename INFO
11745            for regex_replace_key, regex_replace  in regex_replace_dict.items():
11746                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
11747                query = f"""
11748                    UPDATE {table}
11749                    SET
11750                        INFO = {regex_replace}
11751                """
11752                log.debug(f"query={query}")
11753                self.execute_query(query=query)
11754
11755        return fields_renamed
11756
11757    def calculation_rename_info_fields(
11758        self,
11759        fields_to_rename: dict = None,
11760        table: str = None,
11761        operation_name: str = "RENAME_INFO_FIELDS",
11762    ) -> None:
11763        """
11764        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11765        fields to rename and table if provided, and then calls another function to rename the fields.
11766
11767        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11768        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11769        the key and the new field name as the value
11770        :type fields_to_rename: dict
11771        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11772        specify the name of the table for which the fields are to be renamed. It is a string type
11773        parameter
11774        :type table: str
11775        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11776        method is a string that specifies the name of the operation being performed. In this context, it
11777        is used as a default value for the operation name if not explicitly provided when calling the
11778        function, defaults to RENAME_INFO_FIELDS
11779        :type operation_name: str (optional)
11780        """
11781
11782        # Param
11783        param = self.get_param()
11784
11785        # Get param fields to rename
11786        param_fields_to_rename = (
11787            param.get("calculation", {})
11788            .get("calculations", {})
11789            .get(operation_name, {})
11790            .get("fields_to_rename", None)
11791        )
11792
11793        # Get param table
11794        param_table = (
11795            param.get("calculation", {})
11796            .get("calculations", {})
11797            .get(operation_name, {})
11798            .get("table", None)
11799        )
11800
11801        # Init fields_to_rename
11802        if fields_to_rename is None:
11803            fields_to_rename = param_fields_to_rename
11804
11805        # Init table
11806        if table is None:
11807            table = param_table
11808
11809        renamed_fields = self.rename_info_fields(
11810            fields_to_rename=fields_to_rename, table=table
11811        )
11812
11813        log.debug(f"renamed_fields:{renamed_fields}")
class Variants:
   37class Variants:
   38
   39    def __init__(
   40        self,
   41        conn=None,
   42        input: str = None,
   43        output: str = None,
   44        config: dict = {},
   45        param: dict = {},
   46        load: bool = False,
   47    ) -> None:
   48        """
   49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   50        header
   51
   52        :param conn: the connection to the database
   53        :param input: the input file
   54        :param output: the output file
   55        :param config: a dictionary containing the configuration of the model
   56        :param param: a dictionary containing the parameters of the model
   57        """
   58
   59        # Init variables
   60        self.init_variables()
   61
   62        # Input
   63        self.set_input(input)
   64
   65        # Config
   66        self.set_config(config)
   67
   68        # Param
   69        self.set_param(param)
   70
   71        # Output
   72        self.set_output(output)
   73
   74        # connexion
   75        self.set_connexion(conn)
   76
   77        # Header
   78        self.set_header()
   79
   80        # Samples
   81        self.set_samples()
   82
   83        # Load data
   84        if load:
   85            self.load_data()
   86
   87    def set_samples(self, samples: list = None) -> list:
   88        """
   89        The function `set_samples` sets the samples attribute of an object to a provided list or
   90        retrieves it from a parameter dictionary.
   91
   92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   93        input and sets the `samples` attribute of the class to the provided list. If no samples are
   94        provided, it tries to get the samples from the class's parameters using the `get_param` method
   95        :type samples: list
   96        :return: The `samples` list is being returned.
   97        """
   98
   99        if not samples:
  100            samples = self.get_param().get("samples", {}).get("list", None)
  101
  102        self.samples = samples
  103
  104        return samples
  105
  106    def get_samples(self) -> list:
  107        """
  108        This function returns a list of samples.
  109        :return: The `get_samples` method is returning the `samples` attribute of the object.
  110        """
  111
  112        return self.samples
  113
  114    def get_samples_check(self) -> bool:
  115        """
  116        This function returns the value of the "check" key within the "samples" dictionary retrieved
  117        from the parameters.
  118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  120        method. If the key "check" is not found, it will return `False`.
  121        """
  122
  123        return self.get_param().get("samples", {}).get("check", True)
  124
  125    def set_input(self, input: str = None) -> None:
  126        """
  127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  128        attributes in the class accordingly.
  129
  130        :param input: The `set_input` method in the provided code snippet is used to set attributes
  131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  132        :type input: str
  133        """
  134
  135        if input and not isinstance(input, str):
  136            try:
  137                self.input = input.name
  138            except:
  139                log.error(f"Input file '{input} in bad format")
  140                raise ValueError(f"Input file '{input} in bad format")
  141        else:
  142            self.input = input
  143
  144        # Input format
  145        if input:
  146            input_name, input_extension = os.path.splitext(self.input)
  147            self.input_name = input_name
  148            self.input_extension = input_extension
  149            self.input_format = self.input_extension.replace(".", "")
  150
  151    def set_config(self, config: dict) -> None:
  152        """
  153        The set_config function takes a config object and assigns it as the configuration object for the
  154        class.
  155
  156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  157        contains configuration settings for the class. When you call the `set_config` function with a
  158        dictionary object as the argument, it will set that dictionary as the configuration object for
  159        the class
  160        :type config: dict
  161        """
  162
  163        self.config = config
  164
  165    def set_param(self, param: dict) -> None:
  166        """
  167        This function sets a parameter object for the class based on the input dictionary.
  168
  169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  170        as the `param` attribute of the class instance
  171        :type param: dict
  172        """
  173
  174        self.param = param
  175
  176    def init_variables(self) -> None:
  177        """
  178        This function initializes the variables that will be used in the rest of the class
  179        """
  180
  181        self.prefix = "howard"
  182        self.table_variants = "variants"
  183        self.dataframe = None
  184
  185        self.comparison_map = {
  186            "gt": ">",
  187            "gte": ">=",
  188            "lt": "<",
  189            "lte": "<=",
  190            "equals": "=",
  191            "contains": "SIMILAR TO",
  192        }
  193
  194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  195
  196        self.code_type_map_to_sql = {
  197            "Integer": "INTEGER",
  198            "String": "VARCHAR",
  199            "Float": "FLOAT",
  200            "Flag": "VARCHAR",
  201        }
  202
  203        self.index_additionnal_fields = []
  204
  205    def get_indexing(self) -> bool:
  206        """
  207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  208        returns False.
  209        :return: The value of the indexing parameter.
  210        """
  211
  212        return self.get_param().get("indexing", False)
  213
  214    def get_connexion_config(self) -> dict:
  215        """
  216        The function `get_connexion_config` returns a dictionary containing the configuration for a
  217        connection, including the number of threads and memory limit.
  218        :return: a dictionary containing the configuration for the Connexion library.
  219        """
  220
  221        # config
  222        config = self.get_config()
  223
  224        # Connexion config
  225        connexion_config = {}
  226        threads = self.get_threads()
  227
  228        # Threads
  229        if threads:
  230            connexion_config["threads"] = threads
  231
  232        # Memory
  233        # if config.get("memory", None):
  234        #     connexion_config["memory_limit"] = config.get("memory")
  235        if self.get_memory():
  236            connexion_config["memory_limit"] = self.get_memory()
  237
  238        # Temporary directory
  239        if config.get("tmp", None):
  240            connexion_config["temp_directory"] = config.get("tmp")
  241
  242        # Access
  243        if config.get("access", None):
  244            access = config.get("access")
  245            if access in ["RO"]:
  246                access = "READ_ONLY"
  247            elif access in ["RW"]:
  248                access = "READ_WRITE"
  249            connexion_db = self.get_connexion_db()
  250            if connexion_db in ":memory:":
  251                access = "READ_WRITE"
  252            connexion_config["access_mode"] = access
  253
  254        return connexion_config
  255
  256    def get_duckdb_settings(self) -> dict:
  257        """
  258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  259        string.
  260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  261        """
  262
  263        # config
  264        config = self.get_config()
  265
  266        # duckdb settings
  267        duckdb_settings_dict = {}
  268        if config.get("duckdb_settings", None):
  269            duckdb_settings = config.get("duckdb_settings")
  270            duckdb_settings = full_path(duckdb_settings)
  271            # duckdb setting is a file
  272            if os.path.exists(duckdb_settings):
  273                with open(duckdb_settings) as json_file:
  274                    duckdb_settings_dict = yaml.safe_load(json_file)
  275            # duckdb settings is a string
  276            else:
  277                duckdb_settings_dict = json.loads(duckdb_settings)
  278
  279        return duckdb_settings_dict
  280
  281    def set_connexion_db(self) -> str:
  282        """
  283        The function `set_connexion_db` returns the appropriate database connection string based on the
  284        input format and connection type.
  285        :return: the value of the variable `connexion_db`.
  286        """
  287
  288        # Default connexion db
  289        default_connexion_db = ":memory:"
  290
  291        # Find connexion db
  292        if self.get_input_format() in ["db", "duckdb"]:
  293            connexion_db = self.get_input()
  294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  295            connexion_db = default_connexion_db
  296        elif self.get_connexion_type() in ["tmpfile"]:
  297            tmp_name = tempfile.mkdtemp(
  298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  299            )
  300            connexion_db = f"{tmp_name}/tmp.db"
  301        elif self.get_connexion_type() != "":
  302            connexion_db = self.get_connexion_type()
  303        else:
  304            connexion_db = default_connexion_db
  305
  306        # Set connexion db
  307        self.connexion_db = connexion_db
  308
  309        return connexion_db
  310
  311    def set_connexion(self, conn) -> None:
  312        """
  313        The function `set_connexion` creates a connection to a database, with options for different
  314        database formats and settings.
  315
  316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  317        database. If a connection is not provided, a new connection to an in-memory database is created.
  318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  319        sqlite
  320        """
  321
  322        # Connexion db
  323        connexion_db = self.set_connexion_db()
  324
  325        # Connexion config
  326        connexion_config = self.get_connexion_config()
  327
  328        # Connexion format
  329        connexion_format = self.get_config().get("connexion_format", "duckdb")
  330        # Set connexion format
  331        self.connexion_format = connexion_format
  332
  333        # Connexion
  334        if not conn:
  335            if connexion_format in ["duckdb"]:
  336                conn = duckdb.connect(connexion_db, config=connexion_config)
  337                # duckDB settings
  338                duckdb_settings = self.get_duckdb_settings()
  339                if duckdb_settings:
  340                    for setting in duckdb_settings:
  341                        setting_value = duckdb_settings.get(setting)
  342                        if isinstance(setting_value, str):
  343                            setting_value = f"'{setting_value}'"
  344                        conn.execute(f"PRAGMA {setting}={setting_value};")
  345            elif connexion_format in ["sqlite"]:
  346                conn = sqlite3.connect(connexion_db)
  347
  348        # Set connexion
  349        self.conn = conn
  350
  351        # Log
  352        log.debug(f"connexion_format: {connexion_format}")
  353        log.debug(f"connexion_db: {connexion_db}")
  354        log.debug(f"connexion config: {connexion_config}")
  355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  356
  357    def set_output(self, output: str = None) -> None:
  358        """
  359        The `set_output` function in Python sets the output file based on the input or a specified key
  360        in the config file, extracting the output name, extension, and format.
  361
  362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  363        the output file. If the config file has an 'output' key, the method sets the output to the value
  364        of that key. If no output is provided, it sets the output to `None`
  365        :type output: str
  366        """
  367
  368        if output and not isinstance(output, str):
  369            self.output = output.name
  370        else:
  371            self.output = output
  372
  373        # Output format
  374        if self.output:
  375            output_name, output_extension = os.path.splitext(self.output)
  376            self.output_name = output_name
  377            self.output_extension = output_extension
  378            self.output_format = self.output_extension.replace(".", "")
  379        else:
  380            self.output_name = None
  381            self.output_extension = None
  382            self.output_format = None
  383
    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        lines (self.header_list) and as a vcf.Reader object (self.header_vcf).

        Header lookup order, depending on the input format:
        1. explicit 'header_file' entry in the configuration
        2. the header embedded in a .vcf/.hdr input file (plain or bgzip)
        3. a sidecar '<input>.hdr' file next to the input
        4. a header reconstructed from the file's columns (via Database)
        5. a minimal default VCF header as a last resort

        When there is no input file, both attributes are set to None.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal fallback header (no INFO definitions, no samples)
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided explicitly in the configuration
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external sidecar file '.hdr'
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to reconstruct header info fields from the file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database object on the input file
                            db_for_header = Database(database=input_file)

                            # Header columns inferred from the file's columns
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Real columns present in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write the inferred header to a temporary file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Re-read the header file and replace the #CHROM
                            # line with the real columns of the file
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except silently falls back to the default
                    # header on ANY error — confirm this best-effort behavior is
                    # intended rather than catching specific exceptions
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list of lines
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
  485
  486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  487        """
  488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  489        DataFrame based on the connection format.
  490
  491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  492        represents the SQL query you want to execute. This query will be used to fetch data from a
  493        database and convert it into a pandas DataFrame
  494        :type query: str
  495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  497        function will only fetch up to that number of rows from the database query result. If no limit
  498        is specified,
  499        :type limit: int
  500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  501        """
  502
  503        # Connexion format
  504        connexion_format = self.get_connexion_format()
  505
  506        # Limit in query
  507        if limit:
  508            pd.set_option("display.max_rows", limit)
  509            if connexion_format in ["duckdb"]:
  510                df = (
  511                    self.conn.execute(query)
  512                    .fetch_record_batch(limit)
  513                    .read_next_batch()
  514                    .to_pandas()
  515                )
  516            elif connexion_format in ["sqlite"]:
  517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  518
  519        # Full query
  520        else:
  521            if connexion_format in ["duckdb"]:
  522                df = self.conn.execute(query).df()
  523            elif connexion_format in ["sqlite"]:
  524                df = pd.read_sql_query(query, self.conn)
  525
  526        return df
  527
  528    def get_overview(self) -> None:
  529        """
  530        The function prints the input, output, config, and dataframe of the current object
  531        """
  532        table_variants_from = self.get_table_variants(clause="from")
  533        sql_columns = self.get_header_columns_as_sql()
  534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  535        df = self.get_query_to_df(sql_query_export)
  536        log.info(
  537            "Input:  "
  538            + str(self.get_input())
  539            + " ["
  540            + str(str(self.get_input_format()))
  541            + "]"
  542        )
  543        log.info(
  544            "Output: "
  545            + str(self.get_output())
  546            + " ["
  547            + str(str(self.get_output_format()))
  548            + "]"
  549        )
  550        log.info("Config: ")
  551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  552            "\n"
  553        ):
  554            log.info("\t" + str(d))
  555        log.info("Param: ")
  556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  557            "\n"
  558        ):
  559            log.info("\t" + str(d))
  560        log.info("Sample list: " + str(self.get_header_sample_list()))
  561        log.info("Dataframe: ")
  562        for d in str(df).split("\n"):
  563            log.info("\t" + str(d))
  564
  565        # garbage collector
  566        del df
  567        gc.collect()
  568
  569        return None
  570
  571    def get_stats(self) -> dict:
  572        """
  573        The `get_stats` function calculates and returns various statistics of the current object,
  574        including information about the input file, variants, samples, header fields, quality, and
  575        SNVs/InDels.
  576        :return: a dictionary containing various statistics of the current object. The dictionary has
  577        the following structure:
  578        """
  579
  580        # Log
  581        log.info(f"Stats Calculation...")
  582
  583        # table varaints
  584        table_variants_from = self.get_table_variants()
  585
  586        # stats dict
  587        stats = {"Infos": {}}
  588
  589        ### File
  590        input_file = self.get_input()
  591        stats["Infos"]["Input file"] = input_file
  592
  593        # Header
  594        header_infos = self.get_header().infos
  595        header_formats = self.get_header().formats
  596        header_infos_list = list(header_infos)
  597        header_formats_list = list(header_formats)
  598
  599        ### Variants
  600
  601        stats["Variants"] = {}
  602
  603        # Variants by chr
  604        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  605        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  606        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  607            by=["CHROM"], kind="quicksort"
  608        )
  609
  610        # Total number of variants
  611        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  612
  613        # Calculate percentage
  614        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  615            lambda x: (x / nb_of_variants)
  616        )
  617
  618        stats["Variants"]["Number of variants by chromosome"] = (
  619            nb_of_variants_by_chrom.to_dict(orient="index")
  620        )
  621
  622        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  623
  624        ### Samples
  625
  626        # Init
  627        samples = {}
  628        nb_of_samples = 0
  629
  630        # Check Samples
  631        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  632            log.debug(f"Check samples...")
  633            for sample in self.get_header_sample_list():
  634                sql_query_samples = f"""
  635                    SELECT  '{sample}' as sample,
  636                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  637                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  638                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  639                    FROM {table_variants_from}
  640                    WHERE (
  641                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  642                        AND
  643                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  644                      )
  645                    GROUP BY genotype
  646                    """
  647                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  648                sample_genotype_count = sql_query_genotype_df["count"].sum()
  649                if len(sql_query_genotype_df):
  650                    nb_of_samples += 1
  651                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  652                        sql_query_genotype_df.to_dict(orient="index")
  653                    )
  654
  655            stats["Samples"] = samples
  656            stats["Infos"]["Number of samples"] = nb_of_samples
  657
  658        # #
  659        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  660        #     stats["Infos"]["Number of samples"] = nb_of_samples
  661        # elif nb_of_samples:
  662        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  663
  664        ### INFO and FORMAT fields
  665        header_types_df = {}
  666        header_types_list = {
  667            "List of INFO fields": header_infos,
  668            "List of FORMAT fields": header_formats,
  669        }
  670        i = 0
  671        for header_type in header_types_list:
  672
  673            header_type_infos = header_types_list.get(header_type)
  674            header_infos_dict = {}
  675
  676            for info in header_type_infos:
  677
  678                i += 1
  679                header_infos_dict[i] = {}
  680
  681                # ID
  682                header_infos_dict[i]["id"] = info
  683
  684                # num
  685                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  686                if header_type_infos[info].num in genotype_map.keys():
  687                    header_infos_dict[i]["Number"] = genotype_map.get(
  688                        header_type_infos[info].num
  689                    )
  690                else:
  691                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  692
  693                # type
  694                if header_type_infos[info].type:
  695                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  696                else:
  697                    header_infos_dict[i]["Type"] = "."
  698
  699                # desc
  700                if header_type_infos[info].desc != None:
  701                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  702                else:
  703                    header_infos_dict[i]["Description"] = ""
  704
  705            if len(header_infos_dict):
  706                header_types_df[header_type] = pd.DataFrame.from_dict(
  707                    header_infos_dict, orient="index"
  708                ).to_dict(orient="index")
  709
  710        # Stats
  711        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  712        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  713        stats["Header"] = header_types_df
  714
  715        ### QUAL
  716        if "QUAL" in self.get_header_columns():
  717            sql_query_qual = f"""
  718                    SELECT
  719                        avg(CAST(QUAL AS INTEGER)) AS Average,
  720                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  721                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  722                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  723                        median(CAST(QUAL AS INTEGER)) AS Median,
  724                        variance(CAST(QUAL AS INTEGER)) AS Variance
  725                    FROM {table_variants_from}
  726                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  727                    """
  728
  729            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  730            stats["Quality"] = {"Stats": qual}
  731
  732        ### SNV and InDel
  733
  734        sql_query_snv = f"""
  735            
  736            SELECT Type, count FROM (
  737
  738                    SELECT
  739                        'Total' AS Type,
  740                        count(*) AS count
  741                    FROM {table_variants_from}
  742
  743                    UNION
  744
  745                    SELECT
  746                        'MNV' AS Type,
  747                        count(*) AS count
  748                    FROM {table_variants_from}
  749                    WHERE len(REF) > 1 AND len(ALT) > 1
  750                    AND len(REF) = len(ALT)
  751
  752                    UNION
  753
  754                    SELECT
  755                        'InDel' AS Type,
  756                        count(*) AS count
  757                    FROM {table_variants_from}
  758                    WHERE len(REF) > 1 OR len(ALT) > 1
  759                    AND len(REF) != len(ALT)
  760                    
  761                    UNION
  762
  763                    SELECT
  764                        'SNV' AS Type,
  765                        count(*) AS count
  766                    FROM {table_variants_from}
  767                    WHERE len(REF) = 1 AND len(ALT) = 1
  768
  769                )
  770
  771            ORDER BY count DESC
  772
  773                """
  774        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  775
  776        sql_query_snv_substitution = f"""
  777                SELECT
  778                    concat(REF, '>', ALT) AS 'Substitution',
  779                    count(*) AS count
  780                FROM {table_variants_from}
  781                WHERE len(REF) = 1 AND len(ALT) = 1
  782                GROUP BY REF, ALT
  783                ORDER BY count(*) DESC
  784                """
  785        snv_substitution = (
  786            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  787        )
  788        stats["Variants"]["Counts"] = snv_indel
  789        stats["Variants"]["Substitutions"] = snv_substitution
  790
  791        return stats
  792
  793    def stats_to_file(self, file: str = None) -> str:
  794        """
  795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  796        into a JSON object, and writes the JSON object to the specified file.
  797
  798        :param file: The `file` parameter is a string that represents the file path where the JSON data
  799        will be written
  800        :type file: str
  801        :return: the name of the file that was written to.
  802        """
  803
  804        # Get stats
  805        stats = self.get_stats()
  806
  807        # Serializing json
  808        json_object = json.dumps(stats, indent=4)
  809
  810        # Writing to sample.json
  811        with open(file, "w") as outfile:
  812            outfile.write(json_object)
  813
  814        return file
  815
  816    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  817        """
  818        The `print_stats` function generates a markdown file and prints the statistics contained in a
  819        JSON file in a formatted manner.
  820
  821        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  822        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  823        provided, a temporary directory will be created and the stats will be saved in a file named
  824        "stats.md" within that
  825        :type output_file: str
  826        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  827        file where the statistics will be saved. If no value is provided, a temporary directory will be
  828        created and a default file name "stats.json" will be used
  829        :type json_file: str
  830        :return: The function `print_stats` does not return any value. It has a return type annotation
  831        of `None`.
  832        """
  833
  834        # Full path
  835        output_file = full_path(output_file)
  836        json_file = full_path(json_file)
  837
  838        with tempfile.TemporaryDirectory() as tmpdir:
  839
  840            # Files
  841            if not output_file:
  842                output_file = os.path.join(tmpdir, "stats.md")
  843            if not json_file:
  844                json_file = os.path.join(tmpdir, "stats.json")
  845
  846            # Create folders
  847            if not os.path.exists(os.path.dirname(output_file)):
  848                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  849            if not os.path.exists(os.path.dirname(json_file)):
  850                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  851
  852            # Create stats JSON file
  853            stats_file = self.stats_to_file(file=json_file)
  854
  855            # Print stats file
  856            with open(stats_file) as f:
  857                stats = yaml.safe_load(f)
  858
  859            # Output
  860            output_title = []
  861            output_index = []
  862            output = []
  863
  864            # Title
  865            output_title.append("# HOWARD Stats")
  866
  867            # Index
  868            output_index.append("## Index")
  869
  870            # Process sections
  871            for section in stats:
  872                infos = stats.get(section)
  873                section_link = "#" + section.lower().replace(" ", "-")
  874                output.append(f"## {section}")
  875                output_index.append(f"- [{section}]({section_link})")
  876
  877                if len(infos):
  878                    for info in infos:
  879                        try:
  880                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  881                            is_df = True
  882                        except:
  883                            try:
  884                                df = pd.DataFrame.from_dict(
  885                                    json.loads((infos.get(info))), orient="index"
  886                                )
  887                                is_df = True
  888                            except:
  889                                is_df = False
  890                        if is_df:
  891                            output.append(f"### {info}")
  892                            info_link = "#" + info.lower().replace(" ", "-")
  893                            output_index.append(f"   - [{info}]({info_link})")
  894                            output.append(f"{df.to_markdown(index=False)}")
  895                        else:
  896                            output.append(f"- {info}: {infos.get(info)}")
  897                else:
  898                    output.append(f"NA")
  899
  900            # Write stats in markdown file
  901            with open(output_file, "w") as fp:
  902                for item in output_title:
  903                    fp.write("%s\n" % item)
  904                for item in output_index:
  905                    fp.write("%s\n" % item)
  906                for item in output:
  907                    fp.write("%s\n" % item)
  908
  909            # Output stats in markdown
  910            print("")
  911            print("\n\n".join(output_title))
  912            print("")
  913            print("\n\n".join(output))
  914            print("")
  915
  916        return None
  917
  918    def get_input(self) -> str:
  919        """
  920        It returns the value of the input variable.
  921        :return: The input is being returned.
  922        """
  923        return self.input
  924
  925    def get_input_format(self, input_file: str = None) -> str:
  926        """
  927        This function returns the format of the input variable, either from the provided input file or
  928        by prompting for input.
  929
  930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  931        represents the file path of the input file. If no `input_file` is provided when calling the
  932        method, it will default to `None`
  933        :type input_file: str
  934        :return: The format of the input variable is being returned.
  935        """
  936
  937        if not input_file:
  938            input_file = self.get_input()
  939        input_format = get_file_format(input_file)
  940        return input_format
  941
  942    def get_input_compressed(self, input_file: str = None) -> str:
  943        """
  944        The function `get_input_compressed` returns the format of the input variable after compressing
  945        it.
  946
  947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  948        that represents the file path of the input file. If no `input_file` is provided when calling the
  949        method, it will default to `None` and the method will then call `self.get_input()` to
  950        :type input_file: str
  951        :return: The function `get_input_compressed` returns the compressed format of the input
  952        variable.
  953        """
  954
  955        if not input_file:
  956            input_file = self.get_input()
  957        input_compressed = get_file_compressed(input_file)
  958        return input_compressed
  959
  960    def get_output(self) -> str:
  961        """
  962        It returns the output of the neuron.
  963        :return: The output of the neural network.
  964        """
  965
  966        return self.output
  967
  968    def get_output_format(self, output_file: str = None) -> str:
  969        """
  970        The function `get_output_format` returns the format of the input variable or the output file if
  971        provided.
  972
  973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  974        that represents the file path of the output file. If no `output_file` is provided when calling
  975        the method, it will default to the output obtained from the `get_output` method of the class
  976        instance. The
  977        :type output_file: str
  978        :return: The format of the input variable is being returned.
  979        """
  980
  981        if not output_file:
  982            output_file = self.get_output()
  983        output_format = get_file_format(output_file)
  984
  985        return output_format
  986
  987    def get_config(self) -> dict:
  988        """
  989        It returns the config
  990        :return: The config variable is being returned.
  991        """
  992        return self.config
  993
  994    def get_param(self) -> dict:
  995        """
  996        It returns the param
  997        :return: The param variable is being returned.
  998        """
  999        return self.param
 1000
 1001    def get_connexion_db(self) -> str:
 1002        """
 1003        It returns the connexion_db attribute of the object
 1004        :return: The connexion_db is being returned.
 1005        """
 1006        return self.connexion_db
 1007
 1008    def get_prefix(self) -> str:
 1009        """
 1010        It returns the prefix of the object.
 1011        :return: The prefix is being returned.
 1012        """
 1013        return self.prefix
 1014
 1015    def get_table_variants(self, clause: str = "select") -> str:
 1016        """
 1017        This function returns the table_variants attribute of the object
 1018
 1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1020        defaults to select (optional)
 1021        :return: The table_variants attribute of the object.
 1022        """
 1023
 1024        # Access
 1025        access = self.get_config().get("access", None)
 1026
 1027        # Clauses "select", "where", "update"
 1028        if clause in ["select", "where", "update"]:
 1029            table_variants = self.table_variants
 1030        # Clause "from"
 1031        elif clause in ["from"]:
 1032            # For Read Only
 1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1034                input_file = self.get_input()
 1035                table_variants = f"'{input_file}' as variants"
 1036            # For Read Write
 1037            else:
 1038                table_variants = f"{self.table_variants} as variants"
 1039        else:
 1040            table_variants = self.table_variants
 1041        return table_variants
 1042
 1043    def get_tmp_dir(self) -> str:
 1044        """
 1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1046        parameters or a default path.
 1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1048        configuration, parameters, and a default value of "/tmp".
 1049        """
 1050
 1051        return get_tmp(
 1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1053        )
 1054
 1055    def get_connexion_type(self) -> str:
 1056        """
 1057        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1058
 1059        :return: The connexion type is being returned.
 1060        """
 1061        return self.get_config().get("connexion_type", "memory")
 1062
 1063    def get_connexion(self):
 1064        """
 1065        It returns the connection object
 1066
 1067        :return: The connection object.
 1068        """
 1069        return self.conn
 1070
 1071    def close_connexion(self) -> None:
 1072        """
 1073        This function closes the connection to the database.
 1074        :return: The connection is being closed.
 1075        """
 1076        return self.conn.close()
 1077
 1078    def get_header(self, type: str = "vcf"):
 1079        """
 1080        This function returns the header of the VCF file as a list of strings
 1081
 1082        :param type: the type of header you want to get, defaults to vcf (optional)
 1083        :return: The header of the vcf file.
 1084        """
 1085
 1086        if self.header_vcf:
 1087            if type == "vcf":
 1088                return self.header_vcf
 1089            elif type == "list":
 1090                return self.header_list
 1091        else:
 1092            if type == "vcf":
 1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1094                return header
 1095            elif type == "list":
 1096                return vcf_required
 1097
 1098    def get_header_infos_list(self) -> list:
 1099        """
 1100        This function retrieves a list of information fields from the header.
 1101        :return: A list of information fields from the header.
 1102        """
 1103
 1104        # Init
 1105        infos_list = []
 1106
 1107        for field in self.get_header().infos:
 1108            infos_list.append(field)
 1109
 1110        return infos_list
 1111
 1112    def get_header_length(self, file: str = None) -> int:
 1113        """
 1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1115        line.
 1116
 1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1118        header file. If this argument is provided, the function will read the header from the specified
 1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1120        :type file: str
 1121        :return: the length of the header list, excluding the #CHROM line.
 1122        """
 1123
 1124        if file:
 1125            return len(self.read_vcf_header_file(file=file)) - 1
 1126        elif self.get_header(type="list"):
 1127            return len(self.get_header(type="list")) - 1
 1128        else:
 1129            return 0
 1130
 1131    def get_header_columns(self) -> str:
 1132        """
 1133        This function returns the header list of a VCF
 1134
 1135        :return: The length of the header list.
 1136        """
 1137        if self.get_header():
 1138            return self.get_header(type="list")[-1]
 1139        else:
 1140            return ""
 1141
 1142    def get_header_columns_as_list(self) -> list:
 1143        """
 1144        This function returns the header list of a VCF
 1145
 1146        :return: The length of the header list.
 1147        """
 1148        if self.get_header():
 1149            return self.get_header_columns().strip().split("\t")
 1150        else:
 1151            return []
 1152
 1153    def get_header_columns_as_sql(self) -> str:
 1154        """
 1155        This function retruns header length (without #CHROM line)
 1156
 1157        :return: The length of the header list.
 1158        """
 1159        sql_column_list = []
 1160        for col in self.get_header_columns_as_list():
 1161            sql_column_list.append(f'"{col}"')
 1162        return ",".join(sql_column_list)
 1163
 1164    def get_header_sample_list(
 1165        self, check: bool = False, samples: list = None, samples_force: bool = False
 1166    ) -> list:
 1167        """
 1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1169        checking and filtering based on input parameters.
 1170
 1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1172        parameter that determines whether to check if the samples in the list are properly defined as
 1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1174        list is defined as a, defaults to False
 1175        :type check: bool (optional)
 1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1177        allows you to specify a subset of samples from the header. If you provide a list of sample
 1178        names, the function will check if each sample is defined in the header. If a sample is not found
 1179        in the
 1180        :type samples: list
 1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1182        a boolean parameter that determines whether to force the function to return the sample list
 1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1184        function will return the sample list without performing, defaults to False
 1185        :type samples_force: bool (optional)
 1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1187        parameters and conditions specified in the function.
 1188        """
 1189
 1190        # Init
 1191        samples_list = []
 1192
 1193        if samples is None:
 1194            samples_list = self.header_vcf.samples
 1195        else:
 1196            samples_checked = []
 1197            for sample in samples:
 1198                if sample in self.header_vcf.samples:
 1199                    samples_checked.append(sample)
 1200                else:
 1201                    log.warning(f"Sample '{sample}' not defined in header")
 1202            samples_list = samples_checked
 1203
 1204            # Force sample list without checking if is_genotype_column
 1205            if samples_force:
 1206                log.warning(f"Samples {samples_list} not checked if genotypes")
 1207                return samples_list
 1208
 1209        if check:
 1210            samples_checked = []
 1211            for sample in samples_list:
 1212                if self.is_genotype_column(column=sample):
 1213                    samples_checked.append(sample)
 1214                else:
 1215                    log.warning(
 1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1217                    )
 1218            samples_list = samples_checked
 1219
 1220        # Return samples list
 1221        return samples_list
 1222
 1223    def is_genotype_column(self, column: str = None) -> bool:
 1224        """
 1225        This function checks if a given column is a genotype column in a database.
 1226
 1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1228        represents the column name in a database table. This method checks if the specified column is a
 1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1230        method of
 1231        :type column: str
 1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1234        column name and returns the result. If the `column` parameter is None, it returns False.
 1235        """
 1236
 1237        if column is not None:
 1238            return Database(database=self.get_input()).is_genotype_column(column=column)
 1239        else:
 1240            return False
 1241
 1242    def get_verbose(self) -> bool:
 1243        """
 1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1245        exist
 1246
 1247        :return: The value of the key "verbose" in the config dictionary.
 1248        """
 1249        return self.get_config().get("verbose", False)
 1250
 1251    def get_connexion_format(self) -> str:
 1252        """
 1253        It returns the connexion format of the object.
 1254        :return: The connexion_format is being returned.
 1255        """
 1256        connexion_format = self.connexion_format
 1257        if connexion_format not in ["duckdb", "sqlite"]:
 1258            log.error(f"Unknown connexion format {connexion_format}")
 1259            raise ValueError(f"Unknown connexion format {connexion_format}")
 1260        else:
 1261            return connexion_format
 1262
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and append each chunk to the "variants"
        table of the connected database (DuckDB or SQLite).

        :param file: path (or buffer) of the delimited file to load
        :param columns: comma-separated column names used in the INSERT
        statement (DuckDB branch only)
        :type columns: str
        :param header_len: number of leading lines to skip before the data,
        defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to "\\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; overridden by the
        "load.chunk" configuration entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured chunk size takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: a falsy chunksize (0/None) silently skips the load entirely
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "chunk" in the SQL text through its
                    # replacement scan on the local DataFrame of the same
                    # name — do not rename the loop variable.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: let pandas append the chunk directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1316
 1317    def load_data(
 1318        self,
 1319        input_file: str = None,
 1320        drop_variants_table: bool = False,
 1321        sample_size: int = 20480,
 1322    ) -> None:
 1323        """
 1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1325        table before loading the data and specify a sample size.
 1326
 1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1328        table
 1329        :type input_file: str
 1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1331        determines whether the variants table should be dropped before loading the data. If set to
 1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1333        not be dropped, defaults to False
 1334        :type drop_variants_table: bool (optional)
 1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1337        20480
 1338        :type sample_size: int (optional)
 1339        """
 1340
 1341        log.info("Loading...")
 1342
 1343        # change input file
 1344        if input_file:
 1345            self.set_input(input_file)
 1346            self.set_header()
 1347
 1348        # drop variants table
 1349        if drop_variants_table:
 1350            self.drop_variants_table()
 1351
 1352        # get table variants
 1353        table_variants = self.get_table_variants()
 1354
 1355        # Access
 1356        access = self.get_config().get("access", None)
 1357        log.debug(f"access: {access}")
 1358
 1359        # Input format and compress
 1360        input_format = self.get_input_format()
 1361        input_compressed = self.get_input_compressed()
 1362        log.debug(f"input_format: {input_format}")
 1363        log.debug(f"input_compressed: {input_compressed}")
 1364
 1365        # input_compressed_format
 1366        if input_compressed:
 1367            input_compressed_format = "gzip"
 1368        else:
 1369            input_compressed_format = "none"
 1370        log.debug(f"input_compressed_format: {input_compressed_format}")
 1371
 1372        # Connexion format
 1373        connexion_format = self.get_connexion_format()
 1374
 1375        # Sample size
 1376        if not sample_size:
 1377            sample_size = -1
 1378        log.debug(f"sample_size: {sample_size}")
 1379
 1380        # Load data
 1381        log.debug(f"Load Data from {input_format}")
 1382
 1383        # DuckDB connexion
 1384        if connexion_format in ["duckdb"]:
 1385
 1386            # Database already exists
 1387            if self.input_format in ["db", "duckdb"]:
 1388
 1389                if connexion_format in ["duckdb"]:
 1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1391                else:
 1392                    log.error(
 1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1394                    )
 1395                    raise ValueError(
 1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1397                    )
 1398
 1399            # Load from existing database format
 1400            else:
 1401
 1402                try:
 1403                    # Create Table or View
 1404                    database = Database(database=self.input)
 1405                    sql_from = database.get_sql_from(sample_size=sample_size)
 1406
 1407                    if access in ["RO"]:
 1408                        sql_load = (
 1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1410                        )
 1411                    else:
 1412                        sql_load = (
 1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1414                        )
 1415                    self.conn.execute(sql_load)
 1416
 1417                except:
 1418                    # Format not available
 1419                    log.error(f"Input file format '{self.input_format}' not available")
 1420                    raise ValueError(
 1421                        f"Input file format '{self.input_format}' not available"
 1422                    )
 1423
 1424        # SQLite connexion
 1425        elif connexion_format in ["sqlite"] and input_format in [
 1426            "vcf",
 1427            "tsv",
 1428            "csv",
 1429            "psv",
 1430        ]:
 1431
 1432            # Main structure
 1433            structure = {
 1434                "#CHROM": "VARCHAR",
 1435                "POS": "INTEGER",
 1436                "ID": "VARCHAR",
 1437                "REF": "VARCHAR",
 1438                "ALT": "VARCHAR",
 1439                "QUAL": "VARCHAR",
 1440                "FILTER": "VARCHAR",
 1441                "INFO": "VARCHAR",
 1442            }
 1443
 1444            # Strcuture with samples
 1445            structure_complete = structure
 1446            if self.get_header_sample_list():
 1447                structure["FORMAT"] = "VARCHAR"
 1448                for sample in self.get_header_sample_list():
 1449                    structure_complete[sample] = "VARCHAR"
 1450
 1451            # Columns list for create and insert
 1452            sql_create_table_columns = []
 1453            sql_create_table_columns_list = []
 1454            for column in structure_complete:
 1455                column_type = structure_complete[column]
 1456                sql_create_table_columns.append(
 1457                    f'"{column}" {column_type} default NULL'
 1458                )
 1459                sql_create_table_columns_list.append(f'"{column}"')
 1460
 1461            # Create database
 1462            log.debug(f"Create Table {table_variants}")
 1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1466            self.conn.execute(sql_create_table)
 1467
 1468            # chunksize define length of file chunk load file
 1469            chunksize = 100000
 1470
 1471            # delimiter
 1472            delimiter = file_format_delimiters.get(input_format, "\t")
 1473
 1474            # Load the input file
 1475            with open(self.input, "rt") as input_file:
 1476
 1477                # Use the appropriate file handler based on the input format
 1478                if input_compressed:
 1479                    input_file = bgzf.open(self.input, "rt")
 1480                if input_format in ["vcf"]:
 1481                    header_len = self.get_header_length()
 1482                else:
 1483                    header_len = 0
 1484
 1485                # Insert the file contents into a table
 1486                self.insert_file_to_table(
 1487                    input_file,
 1488                    columns=sql_create_table_columns_list_sql,
 1489                    header_len=header_len,
 1490                    sep=delimiter,
 1491                    chunksize=chunksize,
 1492                )
 1493
 1494        else:
 1495            log.error(
 1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1497            )
 1498            raise ValueError(
 1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1500            )
 1501
 1502        # Explode INFOS fields into table fields
 1503        if self.get_explode_infos():
 1504            self.explode_infos(
 1505                prefix=self.get_explode_infos_prefix(),
 1506                fields=self.get_explode_infos_fields(),
 1507                force=True,
 1508            )
 1509
 1510        # Create index after insertion
 1511        self.create_indexes()
 1512
 1513    def get_explode_infos(self) -> bool:
 1514        """
 1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1516        to False if it is not set.
 1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1518        value. If the parameter is not present, it will return False.
 1519        """
 1520
 1521        return self.get_param().get("explode", {}).get("explode_infos", False)
 1522
 1523    def get_explode_infos_fields(
 1524        self,
 1525        explode_infos_fields: str = None,
 1526        remove_fields_not_in_header: bool = False,
 1527    ) -> list:
 1528        """
 1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1530        the input parameter `explode_infos_fields`.
 1531
 1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1534        comma-separated list of field names to explode
 1535        :type explode_infos_fields: str
 1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1537        flag that determines whether to remove fields that are not present in the header. If it is set
 1538        to `True`, any field that is not in the header will be excluded from the list of exploded
 1539        information fields. If it is set to `, defaults to False
 1540        :type remove_fields_not_in_header: bool (optional)
 1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1545        splitting the string by commas.
 1546        """
 1547
 1548        # If no fields, get it in param
 1549        if not explode_infos_fields:
 1550            explode_infos_fields = (
 1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1552            )
 1553
 1554        # If no fields, defined as all fields in header using keyword
 1555        if not explode_infos_fields:
 1556            explode_infos_fields = "*"
 1557
 1558        # If fields list not empty
 1559        if explode_infos_fields:
 1560
 1561            # Input fields list
 1562            if isinstance(explode_infos_fields, str):
 1563                fields_input = explode_infos_fields.split(",")
 1564            elif isinstance(explode_infos_fields, list):
 1565                fields_input = explode_infos_fields
 1566            else:
 1567                fields_input = []
 1568
 1569            # Fields list without * keyword
 1570            fields_without_all = fields_input.copy()
 1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1572                fields_without_all.remove("*")
 1573
 1574            # Fields in header
 1575            fields_in_header = sorted(list(set(self.get_header().infos)))
 1576
 1577            # Construct list of fields
 1578            fields_output = []
 1579            for field in fields_input:
 1580
 1581                # Strip field
 1582                field = field.strip()
 1583
 1584                # format keyword * in regex
 1585                if field.upper() in ["*"]:
 1586                    field = ".*"
 1587
 1588                # Find all fields with pattern
 1589                r = re.compile(field)
 1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1591
 1592                # Remove fields input from search
 1593                if field in fields_search:
 1594                    fields_search = [field]
 1595                elif fields_search != [field]:
 1596                    fields_search = sorted(
 1597                        list(set(fields_search).difference(fields_input))
 1598                    )
 1599
 1600                # If field is not in header (avoid not well formatted header)
 1601                if not fields_search and not remove_fields_not_in_header:
 1602                    fields_search = [field]
 1603
 1604                # Add found fields
 1605                for new_field in fields_search:
 1606                    # Add field, if not already exists, and if it is in header (if asked)
 1607                    if (
 1608                        new_field not in fields_output
 1609                        and (
 1610                            not remove_fields_not_in_header
 1611                            or new_field in fields_in_header
 1612                        )
 1613                        and new_field not in [".*"]
 1614                    ):
 1615                        fields_output.append(new_field)
 1616
 1617            return fields_output
 1618
 1619        else:
 1620
 1621            return []
 1622
 1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1624        """
 1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1627        not provided.
 1628
 1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1630        prefix to be used for exploding or expanding information
 1631        :type explode_infos_prefix: str
 1632        :return: the value of the variable `explode_infos_prefix`.
 1633        """
 1634
 1635        if not explode_infos_prefix:
 1636            explode_infos_prefix = (
 1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1638            )
 1639
 1640        return explode_infos_prefix
 1641
 1642    def add_column(
 1643        self,
 1644        table_name,
 1645        column_name,
 1646        column_type,
 1647        default_value=None,
 1648        drop: bool = False,
 1649    ) -> dict:
 1650        """
 1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1652        doesn't already exist.
 1653
 1654        :param table_name: The name of the table to which you want to add a column
 1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1656        to the table
 1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1658        want to add to the table. It should be a string that represents the desired data type, such as
 1659        "INTEGER", "TEXT", "REAL", etc
 1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1661        default value for the newly added column. If a default value is provided, it will be assigned to
 1662        the column for any existing rows that do not have a value for that column
 1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1666        to False
 1667        :type drop: bool (optional)
 1668        :return: a boolean value indicating whether the column was successfully added to the table.
 1669        """
 1670
 1671        # added
 1672        added = False
 1673        dropped = False
 1674
 1675        # Check if the column already exists in the table
 1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1677        columns = self.get_query_to_df(query).columns.tolist()
 1678        if column_name.upper() in [c.upper() for c in columns]:
 1679            log.debug(
 1680                f"The {column_name} column already exists in the {table_name} table"
 1681            )
 1682            if drop:
 1683                self.drop_column(table_name=table_name, column_name=column_name)
 1684                dropped = True
 1685            else:
 1686                return None
 1687        else:
 1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1689
 1690        # Add column in table
 1691        add_column_query = (
 1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1693        )
 1694        if default_value is not None:
 1695            add_column_query += f" DEFAULT {default_value}"
 1696        self.execute_query(add_column_query)
 1697        added = not dropped
 1698        log.debug(
 1699            f"The {column_name} column was successfully added to the {table_name} table"
 1700        )
 1701
 1702        if added:
 1703            added_column = {
 1704                "table_name": table_name,
 1705                "column_name": column_name,
 1706                "column_type": column_type,
 1707                "default_value": default_value,
 1708            }
 1709        else:
 1710            added_column = None
 1711
 1712        return added_column
 1713
 1714    def drop_column(
 1715        self, column: dict = None, table_name: str = None, column_name: str = None
 1716    ) -> bool:
 1717        """
 1718        The `drop_column` function drops a specified column from a given table in a database and returns
 1719        True if the column was successfully dropped, and False if the column does not exist in the
 1720        table.
 1721
 1722        :param column: The `column` parameter is a dictionary that contains information about the column
 1723        you want to drop. It has two keys:
 1724        :type column: dict
 1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1726        drop a column
 1727        :type table_name: str
 1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1729        from the table
 1730        :type column_name: str
 1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1732        and False if the column does not exist in the table.
 1733        """
 1734
 1735        # Find column infos
 1736        if column:
 1737            if isinstance(column, dict):
 1738                table_name = column.get("table_name", None)
 1739                column_name = column.get("column_name", None)
 1740            elif isinstance(column, str):
 1741                table_name = self.get_table_variants()
 1742                column_name = column
 1743            else:
 1744                table_name = None
 1745                column_name = None
 1746
 1747        if not table_name and not column_name:
 1748            return False
 1749
 1750        # Removed
 1751        removed = False
 1752
 1753        # Check if the column already exists in the table
 1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1755        columns = self.get_query_to_df(query).columns.tolist()
 1756        if column_name in columns:
 1757            log.debug(f"The {column_name} column exists in the {table_name} table")
 1758        else:
 1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1760            return False
 1761
 1762        # Add column in table # ALTER TABLE integers DROP k
 1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1764        self.execute_query(add_column_query)
 1765        removed = True
 1766        log.debug(
 1767            f"The {column_name} column was successfully dropped to the {table_name} table"
 1768        )
 1769
 1770        return removed
 1771
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix (falling back to "INFO/")
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
        a list to the `
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually. The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name. If the `table`
        parameter is
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # Drop indexes first: UPDATEs on indexed columns would be slow and the
        # new columns change the schema anyway
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite") selects the SQL dialect used
        # to extract values from the INFO string
        connexion_format = self.get_connexion_format()

        # Access mode; nothing is done on a read-only ("RO") database
        access = self.get_config().get("access", None)

        # Added columns (returned to the caller, e.g. for later cleanup)
        added_columns = []

        if access not in ["RO"]:

            # prefix for the exploded column names (e.g. "INFO/DP")
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants (explicit table wins over the default variants table)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos; best-effort: absence of extra infos is not an error
            # NOTE(review): bare except also swallows KeyboardInterrupt — confirm intended
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos (VCF INFO field definitions: id -> type/num)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One SET clause per field, collected then applied per chromosome
            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regexes -> concrete names)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column (prefix + INFO field id)
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # SQL type from the VCF header; unknown fields default to
                    # a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (num != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (drop=force re-creates an existing column)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    # add_column returns None when the column already existed
                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the
                        # INFO string ('' and '.' are normalized to NULL)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no REGEXP_EXTRACT: emulate it with
                            # nested instr/substr over the INFO string
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes: split the UPDATE into one pass per chromosome
                # to keep each statement's working set smaller
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    # Fallback: a single pass with no WHERE clause
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is only one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE setting every exploded column at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded column
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (including the newly registered additional fields)
        if create_index:
            self.create_indexes()

        return added_columns
 1988
 1989    def create_indexes(self) -> None:
 1990        """
 1991        Create indexes on the table after insertion
 1992        """
 1993
 1994        # Access
 1995        access = self.get_config().get("access", None)
 1996
 1997        # get table variants
 1998        table_variants = self.get_table_variants("FROM")
 1999
 2000        if self.get_indexing() and access not in ["RO"]:
 2001            # Create index
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2009            self.conn.execute(sql_create_table_index)
 2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2011            self.conn.execute(sql_create_table_index)
 2012            for field in self.index_additionnal_fields:
 2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2014                self.conn.execute(sql_create_table_index)
 2015
 2016    def drop_indexes(self) -> None:
 2017        """
 2018        Create indexes on the table after insertion
 2019        """
 2020
 2021        # Access
 2022        access = self.get_config().get("access", None)
 2023
 2024        # get table variants
 2025        table_variants = self.get_table_variants("FROM")
 2026
 2027        # Get database format
 2028        connexion_format = self.get_connexion_format()
 2029
 2030        if access not in ["RO"]:
 2031            if connexion_format in ["duckdb"]:
 2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2033            elif connexion_format in ["sqlite"]:
 2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2035
 2036            list_indexes = self.conn.execute(sql_list_indexes)
 2037            index_names = [row[0] for row in list_indexes.fetchall()]
 2038            for index in index_names:
 2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2040                self.conn.execute(sql_drop_table_index)
 2041
 2042    def read_vcf_header(self, f) -> list:
 2043        """
 2044        It reads the header of a VCF file and returns a list of the header lines
 2045
 2046        :param f: the file object
 2047        :return: The header lines of the VCF file.
 2048        """
 2049
 2050        header_list = []
 2051        for line in f:
 2052            header_list.append(line)
 2053            if line.startswith("#CHROM"):
 2054                break
 2055        return header_list
 2056
 2057    def read_vcf_header_file(self, file: str = None) -> list:
 2058        """
 2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2060        uncompressed files.
 2061
 2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2064        default to `None`
 2065        :type file: str
 2066        :return: The function `read_vcf_header_file` returns a list.
 2067        """
 2068
 2069        if self.get_input_compressed(input_file=file):
 2070            with bgzf.open(file, "rt") as f:
 2071                return self.read_vcf_header(f=f)
 2072        else:
 2073            with open(file, "rt") as f:
 2074                return self.read_vcf_header(f=f)
 2075
 2076    def execute_query(self, query: str):
 2077        """
 2078        It takes a query as an argument, executes it, and returns the results
 2079
 2080        :param query: The query to be executed
 2081        :return: The result of the query is being returned.
 2082        """
 2083        if query:
 2084            return self.conn.execute(query)  # .fetchall()
 2085        else:
 2086            return None
 2087
 2088    def export_output(
 2089        self,
 2090        output_file: str | None = None,
 2091        output_header: str | None = None,
 2092        export_header: bool = True,
 2093        query: str | None = None,
 2094        parquet_partitions: list | None = None,
 2095        chunk_size: int | None = None,
 2096        threads: int | None = None,
 2097        sort: bool = False,
 2098        index: bool = False,
 2099        order_by: str | None = None,
 2100        fields_to_rename: dict | None = None
 2101    ) -> bool:
 2102        """
 2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2105        partitioning.
 2106        
 2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2108        output file where the exported data will be saved
 2109        :type output_file: str | None
 2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2112        header will be exported to a file with the same name as the `output_file` parameter, but with
 2113        the extension "
 2114        :type output_header: str | None
 2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2117        True, the header will be exported to a file. If `export_header` is False, the header will not
 2118        be, defaults to True
 2119        :type export_header: bool (optional)
 2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2121        that can be used to filter and select specific data from the VCF file before exporting it. If
 2122        provided, only the data that matches the query will be exported. This allows you to customize
 2123        the exported data based on
 2124        :type query: str | None
 2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2127        organize data in a hierarchical directory structure based on the values of one or more columns.
 2128        This can improve query performance when working with large datasets
 2129        :type parquet_partitions: list | None
 2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2132        multiple files. It helps in optimizing the export process by breaking down the data into
 2133        manageable chunks for processing and storage
 2134        :type chunk_size: int | None
 2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2136        threads to be used during the export process. It determines the level of parallelism and can
 2137        improve the performance of the export operation. If this parameter is not provided, the function
 2138        will use the default number of threads
 2139        :type threads: int | None
 2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2141        determines whether the output file should be sorted based on genomic coordinates of the
 2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2143        `False`,, defaults to False
 2144        :type sort: bool (optional)
 2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2146        determines whether an index should be created on the output file. If `index` is set to `True`,
 2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2148        :type index: bool (optional)
 2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2152        output file should be
 2153        :type order_by: str | None
 2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2155        mapping of field names to be renamed during the export process. This parameter allows you to
 2156        customize the output field names before exporting the data. Each key-value pair in the
 2157        dictionary represents the original field name as the key and the new field name
 2158        :type fields_to_rename: dict | None
 2159        :return: The `export_output` function returns a boolean value. It checks if the output file
 2160        exists and returns True if it does, or None if it doesn't.
 2161        """
 2162
 2163        # Log
 2164        log.info("Exporting...")
 2165
 2166        # Full path
 2167        output_file = full_path(output_file)
 2168        output_header = full_path(output_header)
 2169
 2170        # Config
 2171        config = self.get_config()
 2172
 2173        # Param
 2174        param = self.get_param()
 2175
 2176        # Tmp files to remove
 2177        tmp_to_remove = []
 2178
 2179        # If no output, get it
 2180        if not output_file:
 2181            output_file = self.get_output()
 2182
 2183        # If not threads
 2184        if not threads:
 2185            threads = self.get_threads()
 2186
 2187        # Rename fields
 2188        if not fields_to_rename:
 2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2191
 2192        # Auto header name with extension
 2193        if export_header or output_header:
 2194            if not output_header:
 2195                output_header = f"{output_file}.hdr"
 2196            # Export header
 2197            self.export_header(output_file=output_file)
 2198
 2199        # Switch off export header if VCF output
 2200        output_file_type = get_file_format(output_file)
 2201        if output_file_type in ["vcf"]:
 2202            export_header = False
 2203            tmp_to_remove.append(output_header)
 2204
 2205        # Chunk size
 2206        if not chunk_size:
 2207            chunk_size = config.get("chunk_size", None)
 2208
 2209        # Parquet partition
 2210        if not parquet_partitions:
 2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2212        if parquet_partitions and isinstance(parquet_partitions, str):
 2213            parquet_partitions = parquet_partitions.split(",")
 2214
 2215        # Order by
 2216        if not order_by:
 2217            order_by = param.get("export", {}).get("order_by", "")
 2218
 2219        # Header in output
 2220        header_in_output = param.get("export", {}).get("include_header", False)
 2221
 2222        # Database
 2223        database_source = self.get_connexion()
 2224
 2225        # Connexion format
 2226        connexion_format = self.get_connexion_format()
 2227
 2228        # Explode infos
 2229        if self.get_explode_infos():
 2230            self.explode_infos(
 2231                prefix=self.get_explode_infos_prefix(),
 2232                fields=self.get_explode_infos_fields(),
 2233                force=False,
 2234            )
 2235
 2236        # if connexion_format in ["sqlite"] or query:
 2237        if connexion_format in ["sqlite"]:
 2238
 2239            # Export in Parquet
 2240            random_tmp = "".join(
 2241                random.choice(string.ascii_lowercase) for i in range(10)
 2242            )
 2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2244            tmp_to_remove.append(database_source)
 2245
 2246            # Table Variants
 2247            table_variants = self.get_table_variants()
 2248
 2249            # Create export query
 2250            sql_query_export_subquery = f"""
 2251                SELECT * FROM {table_variants}
 2252                """
 2253
 2254            # Write source file
 2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2256
 2257        # Create database
 2258        database = Database(
 2259            database=database_source,
 2260            table="variants",
 2261            header_file=output_header,
 2262            conn_config=self.get_connexion_config(),
 2263        )
 2264
 2265        # Existing colomns header
 2266        existing_columns_header = database.get_header_columns_from_database(query=query)
 2267
 2268        # Sample list
 2269        if output_file_type in ["vcf"]:
 2270            get_samples = self.get_samples()
 2271            get_samples_check = self.get_samples_check()
 2272            samples_force = get_samples is not None
 2273            sample_list = self.get_header_sample_list(
 2274                check=get_samples_check,
 2275                samples=get_samples,
 2276                samples_force=samples_force,
 2277            )
 2278        else:
 2279            sample_list = None
 2280
 2281        # Export file
 2282        database.export(
 2283            output_database=output_file,
 2284            output_header=output_header,
 2285            existing_columns_header=existing_columns_header,
 2286            parquet_partitions=parquet_partitions,
 2287            chunk_size=chunk_size,
 2288            threads=threads,
 2289            sort=sort,
 2290            index=index,
 2291            header_in_output=header_in_output,
 2292            order_by=order_by,
 2293            query=query,
 2294            export_header=export_header,
 2295            sample_list=sample_list,
 2296        )
 2297
 2298        # Remove
 2299        remove_if_exists(tmp_to_remove)
 2300
 2301        return (os.path.exists(output_file) or None) and (
 2302            os.path.exists(output_file) or None
 2303        )
 2304
 2305    def get_extra_infos(self, table: str = None) -> list:
 2306        """
 2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2308        in the header.
 2309
 2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2311        name of the table from which you want to retrieve the extra columns that are not present in the
 2312        header. If the `table` parameter is not provided when calling the function, it will default to
 2313        using the variants
 2314        :type table: str
 2315        :return: A list of columns that are in the specified table but not in the header of the table.
 2316        """
 2317
 2318        header_columns = []
 2319
 2320        if not table:
 2321            table = self.get_table_variants(clause="from")
 2322            header_columns = self.get_header_columns()
 2323
 2324        # Check all columns in the database
 2325        query = f""" SELECT * FROM {table} LIMIT 1 """
 2326        log.debug(f"query {query}")
 2327        table_columns = self.get_query_to_df(query).columns.tolist()
 2328        extra_columns = []
 2329
 2330        # Construct extra infos (not in header)
 2331        for column in table_columns:
 2332            if column not in header_columns:
 2333                extra_columns.append(column)
 2334
 2335        return extra_columns
 2336
 2337    def get_extra_infos_sql(self, table: str = None) -> str:
 2338        """
 2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2340        by double quotes
 2341
 2342        :param table: The name of the table to get the extra infos from. If None, the default table is
 2343        used
 2344        :type table: str
 2345        :return: A string of the extra infos
 2346        """
 2347
 2348        return ", ".join(
 2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2350        )
 2351
 2352    def export_header(
 2353        self,
 2354        header_name: str = None,
 2355        output_file: str = None,
 2356        output_file_ext: str = ".hdr",
 2357        clean_header: bool = True,
 2358        remove_chrom_line: bool = False,
 2359    ) -> str:
 2360        """
 2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2362        specified options, and writes it to a new file.
 2363
 2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2365        this parameter is not specified, the header will be written to the output file
 2366        :type header_name: str
 2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2368        specify the name of the output file where the header will be written. If this parameter is not
 2369        provided, the header will be written to a temporary file
 2370        :type output_file: str
 2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2373        if not specified by the user. This extension will be appended to the `output_file` name to
 2374        create the final, defaults to .hdr
 2375        :type output_file_ext: str (optional)
 2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2378        `True`, the function will clean the header by modifying certain lines based on a specific
 2379        pattern. If `clean_header`, defaults to True
 2380        :type clean_header: bool (optional)
 2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2382        boolean flag that determines whether the #CHROM line should be removed from the header before
 2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2384        defaults to False
 2385        :type remove_chrom_line: bool (optional)
 2386        :return: The function `export_header` returns the name of the temporary header file that is
 2387        created.
 2388        """
 2389
 2390        if not header_name and not output_file:
 2391            output_file = self.get_output()
 2392
 2393        if self.get_header():
 2394
 2395            # Get header object
 2396            header_obj = self.get_header()
 2397
 2398            # Create database
 2399            db_for_header = Database(database=self.get_input())
 2400
 2401            # Get real columns in the file
 2402            db_header_columns = db_for_header.get_columns()
 2403
 2404            with tempfile.TemporaryDirectory() as tmpdir:
 2405
 2406                # Write header file
 2407                header_file_tmp = os.path.join(tmpdir, "header")
 2408                f = open(header_file_tmp, "w")
 2409                vcf.Writer(f, header_obj)
 2410                f.close()
 2411
 2412                # Replace #CHROM line with rel columns
 2413                header_list = db_for_header.read_header_file(
 2414                    header_file=header_file_tmp
 2415                )
 2416                header_list[-1] = "\t".join(db_header_columns)
 2417
 2418                # Remove CHROM line
 2419                if remove_chrom_line:
 2420                    header_list.pop()
 2421
 2422                # Clean header
 2423                if clean_header:
 2424                    header_list_clean = []
 2425                    for head in header_list:
 2426                        # Clean head for malformed header
 2427                        head_clean = head
 2428                        head_clean = re.subn(
 2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2431                            head_clean,
 2432                            2,
 2433                        )[0]
 2434                        # Write header
 2435                        header_list_clean.append(head_clean)
 2436                    header_list = header_list_clean
 2437
 2438            tmp_header_name = output_file + output_file_ext
 2439
 2440            f = open(tmp_header_name, "w")
 2441            for line in header_list:
 2442                f.write(line)
 2443            f.close()
 2444
 2445        return tmp_header_name
 2446
 2447    def export_variant_vcf(
 2448        self,
 2449        vcf_file,
 2450        remove_info: bool = False,
 2451        add_samples: bool = True,
 2452        list_samples: list = [],
 2453        where_clause: str = "",
 2454        index: bool = False,
 2455        threads: int | None = None,
 2456    ) -> bool | None:
 2457        """
 2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2459        remove INFO field, add samples, and control compression and indexing.
 2460
 2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2462        written to. It is the output file that will contain the filtered VCF data based on the specified
 2463        parameters
 2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2467        in, defaults to False
 2468        :type remove_info: bool (optional)
 2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2471        If set to False, the samples will be removed. The default value is True, defaults to True
 2472        :type add_samples: bool (optional)
 2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2474        in the output VCF file. By default, all samples will be included. If you provide a list of
 2475        samples, only those samples will be included in the output file
 2476        :type list_samples: list
 2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2478        determines whether or not to create an index for the output VCF file. If `index` is set to
 2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2480        :type index: bool (optional)
 2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2483        will be used during the export process. More threads can potentially speed up the export process
 2484        by utilizing multiple cores of the processor. If
 2485        :type threads: int | None
 2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2487        method with various parameters including the output file, query, threads, sort flag, and index
 2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2489        specified parameters and configurations provided in the `export_variant_vcf` function.
 2490        """
 2491
 2492        # Config
 2493        config = self.get_config()
 2494
 2495        # Extract VCF
 2496        log.debug("Export VCF...")
 2497
 2498        # Table variants
 2499        table_variants = self.get_table_variants()
 2500
 2501        # Threads
 2502        if not threads:
 2503            threads = self.get_threads()
 2504
 2505        # Info fields
 2506        if remove_info:
 2507            if not isinstance(remove_info, str):
 2508                remove_info = "."
 2509            info_field = f"""'{remove_info}' as INFO"""
 2510        else:
 2511            info_field = "INFO"
 2512
 2513        # Samples fields
 2514        if add_samples:
 2515            if not list_samples:
 2516                list_samples = self.get_header_sample_list()
 2517            if list_samples:
 2518                samples_fields = " , FORMAT , " + " , ".join(
 2519                    [f""" "{sample}" """ for sample in list_samples]
 2520                )
 2521            else:
 2522                samples_fields = ""
 2523            log.debug(f"samples_fields: {samples_fields}")
 2524        else:
 2525            samples_fields = ""
 2526
 2527        # Where clause
 2528        if where_clause is None:
 2529            where_clause = ""
 2530
 2531        # Variants
 2532        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2533        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2534        log.debug(f"sql_query_select={sql_query_select}")
 2535
 2536        return self.export_output(
 2537            output_file=vcf_file,
 2538            output_header=None,
 2539            export_header=True,
 2540            query=sql_query_select,
 2541            parquet_partitions=None,
 2542            chunk_size=config.get("chunk_size", None),
 2543            threads=threads,
 2544            sort=True,
 2545            index=index,
 2546            order_by=None,
 2547        )
 2548
 2549    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2550        """
 2551        It takes a list of commands and runs them in parallel using the number of threads specified
 2552
 2553        :param commands: A list of commands to run
 2554        :param threads: The number of threads to use, defaults to 1 (optional)
 2555        """
 2556
 2557        run_parallel_commands(commands, threads)
 2558
 2559    def get_threads(self, default: int = 1) -> int:
 2560        """
 2561        This function returns the number of threads to use for a job, with a default value of 1 if not
 2562        specified.
 2563
 2564        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2565        default number of threads to use if no specific value is provided. If no value is provided for
 2566        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2567        used, defaults to 1
 2568        :type default: int (optional)
 2569        :return: the number of threads to use for the current job.
 2570        """
 2571
 2572        # Config
 2573        config = self.get_config()
 2574
 2575        # Param
 2576        param = self.get_param()
 2577
 2578        # Input threads
 2579        input_thread = param.get("threads", config.get("threads", None))
 2580
 2581        # Check threads
 2582        if not input_thread:
 2583            threads = default
 2584        elif int(input_thread) <= 0:
 2585            threads = os.cpu_count()
 2586        else:
 2587            threads = int(input_thread)
 2588        return threads
 2589
 2590    def get_memory(self, default: str = None) -> str:
 2591        """
 2592        This function retrieves the memory value from parameters or configuration with a default value
 2593        if not found.
 2594
 2595        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2596        default value is used as a fallback in case the `memory` parameter is not provided in the
 2597        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2598        the function
 2599        :type default: str
 2600        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2601        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2602        return the default value provided as an argument to the function.
 2603        """
 2604
 2605        # Config
 2606        config = self.get_config()
 2607
 2608        # Param
 2609        param = self.get_param()
 2610
 2611        # Input threads
 2612        input_memory = param.get("memory", config.get("memory", None))
 2613
 2614        # Check threads
 2615        if input_memory:
 2616            memory = input_memory
 2617        else:
 2618            memory = default
 2619
 2620        return memory
 2621
 2622    def update_from_vcf(self, vcf_file: str) -> None:
 2623        """
 2624        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2625
 2626        :param vcf_file: the path to the VCF file
 2627        """
 2628
 2629        connexion_format = self.get_connexion_format()
 2630
 2631        if connexion_format in ["duckdb"]:
 2632            self.update_from_vcf_duckdb(vcf_file)
 2633        elif connexion_format in ["sqlite"]:
 2634            self.update_from_vcf_sqlite(vcf_file)
 2635
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file, using a duckdb correlated UPDATE.

        The VCF body is loaded into a pandas DataFrame (`vcf_df`), which
        the duckdb query references directly. For each variant matched on
        #CHROM/POS/REF/ALT, the VCF's INFO value is appended to the
        existing INFO value, with ';' inserted when both sides are
        non-empty; empty markers ('' and '.') are treated as no value.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame; the header lines are skipped
        # and the "#CHROM ..." line becomes the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Correlated UPDATE: duckdb resolves `vcf_df` as a table from the
        # local Python scope
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2691
 2692    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2693        """
 2694        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2695        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2696        table
 2697
 2698        :param vcf_file: The path to the VCF file you want to update the database with
 2699        """
 2700
 2701        # Create a temporary table for the VCF
 2702        table_vcf = "tmp_vcf"
 2703        sql_create = (
 2704            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2705        )
 2706        self.conn.execute(sql_create)
 2707
 2708        # Loading VCF into temporaire table
 2709        vcf_df = pd.read_csv(
 2710            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2711        )
 2712        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2713        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2714
 2715        # Update table 'variants' with VCF data
 2716        # warning: CONCAT as || operator
 2717        sql_query_update = f"""
 2718            UPDATE variants as table_variants
 2719            SET INFO = CASE
 2720                            WHEN INFO NOT IN ('', '.')
 2721                            THEN INFO
 2722                            ELSE ''
 2723                        END ||
 2724                        (
 2725                        SELECT 
 2726                            CASE 
 2727                                WHEN table_variants.INFO NOT IN ('','.') 
 2728                                    AND table_vcf.INFO NOT IN ('','.')  
 2729                                THEN ';' 
 2730                                ELSE '' 
 2731                            END || 
 2732                            CASE 
 2733                                WHEN table_vcf.INFO NOT IN ('','.') 
 2734                                THEN table_vcf.INFO 
 2735                                ELSE '' 
 2736                            END
 2737                        FROM {table_vcf} as table_vcf
 2738                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2739                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2740                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2741                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2742                        )
 2743        """
 2744        self.conn.execute(sql_query_update)
 2745
 2746        # Drop temporary table
 2747        sql_drop = f"DROP TABLE {table_vcf}"
 2748        self.conn.execute(sql_drop)
 2749
 2750    def drop_variants_table(self) -> None:
 2751        """
 2752        > This function drops the variants table
 2753        """
 2754
 2755        table_variants = self.get_table_variants()
 2756        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2757        self.conn.execute(sql_table_variants)
 2758
 2759    def set_variant_id(
 2760        self, variant_id_column: str = "variant_id", force: bool = None
 2761    ) -> str:
 2762        """
 2763        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2764        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2765
 2766        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2767        to variant_id
 2768        :type variant_id_column: str (optional)
 2769        :param force: If True, the variant_id column will be created even if it already exists
 2770        :type force: bool
 2771        :return: The name of the column that contains the variant_id
 2772        """
 2773
 2774        # Assembly
 2775        assembly = self.get_param().get(
 2776            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2777        )
 2778
 2779        # INFO/Tag prefix
 2780        prefix = self.get_explode_infos_prefix()
 2781
 2782        # Explode INFO/SVTYPE
 2783        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2784
 2785        # variants table
 2786        table_variants = self.get_table_variants()
 2787
 2788        # variant_id column
 2789        if not variant_id_column:
 2790            variant_id_column = "variant_id"
 2791
 2792        # Creta variant_id column
 2793        if "variant_id" not in self.get_extra_infos() or force:
 2794
 2795            # Create column
 2796            self.add_column(
 2797                table_name=table_variants,
 2798                column_name=variant_id_column,
 2799                column_type="UBIGINT",
 2800                default_value="0",
 2801            )
 2802
 2803            # Update column
 2804            self.conn.execute(
 2805                f"""
 2806                    UPDATE {table_variants}
 2807                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2808                """
 2809            )
 2810
 2811        # Remove added columns
 2812        for added_column in added_columns:
 2813            self.drop_column(column=added_column)
 2814
 2815        # return variant_id column name
 2816        return variant_id_column
 2817
 2818    def get_variant_id_column(
 2819        self, variant_id_column: str = "variant_id", force: bool = None
 2820    ) -> str:
 2821        """
 2822        This function returns the variant_id column name
 2823
 2824        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2825        defaults to variant_id
 2826        :type variant_id_column: str (optional)
 2827        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2828        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2829        if it is not already set, or if it is set
 2830        :type force: bool
 2831        :return: The variant_id column name.
 2832        """
 2833
 2834        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2835
 2836    ###
 2837    # Annotation
 2838    ###
 2839
 2840    def scan_databases(
 2841        self,
 2842        database_formats: list = ["parquet"],
 2843        database_releases: list = ["current"],
 2844    ) -> dict:
 2845        """
 2846        The function `scan_databases` scans for available databases based on specified formats and
 2847        releases.
 2848
 2849        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2850        of the databases to be scanned. In this case, the accepted format is "parquet"
 2851        :type database_formats: list ["parquet"]
 2852        :param database_releases: The `database_releases` parameter is a list that specifies the
 2853        releases of the databases to be scanned. In the provided function, the default value for
 2854        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2855        databases that are in the "current"
 2856        :type database_releases: list
 2857        :return: The function `scan_databases` returns a dictionary containing information about
 2858        databases that match the specified formats and releases.
 2859        """
 2860
 2861        # Config
 2862        config = self.get_config()
 2863
 2864        # Param
 2865        param = self.get_param()
 2866
 2867        # Param - Assembly
 2868        assembly = param.get("assembly", config.get("assembly", None))
 2869        if not assembly:
 2870            assembly = DEFAULT_ASSEMBLY
 2871            log.warning(f"Default assembly '{assembly}'")
 2872
 2873        # Scan for availabled databases
 2874        log.info(
 2875            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2876        )
 2877        databases_infos_dict = databases_infos(
 2878            database_folder_releases=database_releases,
 2879            database_formats=database_formats,
 2880            assembly=assembly,
 2881            config=config,
 2882        )
 2883        log.info(
 2884            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2885        )
 2886
 2887        return databases_infos_dict
 2888
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Quick-annotation strings (param["annotations"], comma-separated) and the
        per-tool shortcut options (annotation_parquet, annotation_snpsift, ...)
        are merged, resolved to database files, and dispatched into the structured
        param["annotation"] dict; then each configured annotation tool is run.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders
        # (union of the generic annotations folder plus parquet/bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param
        # Per-tool shortcuts are folded into the quick-annotation list,
        # prefixed with the tool name (e.g. "snpsift:", "bcftools:");
        # commas inside a tool value are rewritten as '+' so they survive
        # the later comma-join/split round trip
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    # NOTE(review): defaults here are plain strings; they only
                    # become lists when a format=/release= option is given.
                    # scan_databases is then called with a str — confirm
                    # databases_infos accepts both
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        # (everything after the first ':' becomes the options string)
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection
                        # (an explicit tool prefix is stripped and remembered;
                        # otherwise the tool is inferred later from the file format)
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files
                        # ('+' and ':' are both accepted as file separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: as-is, expanded path, then by
                                # scanning the configured database folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    # (bcftools only when preferred AND the file is a
                                    # compressed+indexed vcf/bed; otherwise by format)
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice — harmless duplicate
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Run each configured annotation tool
        # NOTE(review): exomiser/splice use "is not None" while the other tools
        # use truthiness, so they run even on falsy-but-not-None configs — confirm intended
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3266
 3267    def annotation_bigwig(self, threads: int = None) -> None:
 3268        """
 3269        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3270
 3271        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3272        number of threads to be used for parallel processing during the annotation process. If the
 3273        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3274        threads to use based on the system configuration
 3275        :type threads: int
 3276        :return: True
 3277        """
 3278
 3279        # DEBUG
 3280        log.debug("Start annotation with bigwig databases")
 3281
 3282        # # Threads
 3283        # if not threads:
 3284        #     threads = self.get_threads()
 3285        # log.debug("Threads: " + str(threads))
 3286
 3287        # Config
 3288        config = self.get_config()
 3289        log.debug("Config: " + str(config))
 3290
 3291        # Config - BCFTools databases folders
 3292        databases_folders = set(
 3293            self.get_config()
 3294            .get("folders", {})
 3295            .get("databases", {})
 3296            .get("annotations", ["."])
 3297            + self.get_config()
 3298            .get("folders", {})
 3299            .get("databases", {})
 3300            .get("bigwig", ["."])
 3301        )
 3302        log.debug("Databases annotations: " + str(databases_folders))
 3303
 3304        # Param
 3305        annotations = (
 3306            self.get_param()
 3307            .get("annotation", {})
 3308            .get("bigwig", {})
 3309            .get("annotations", None)
 3310        )
 3311        log.debug("Annotations: " + str(annotations))
 3312
 3313        # Assembly
 3314        assembly = self.get_param().get(
 3315            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3316        )
 3317
 3318        # Data
 3319        table_variants = self.get_table_variants()
 3320
 3321        # Check if not empty
 3322        log.debug("Check if not empty")
 3323        sql_query_chromosomes = (
 3324            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3325        )
 3326        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3327        if not sql_query_chromosomes_df["count"][0]:
 3328            log.info(f"VCF empty")
 3329            return
 3330
 3331        # VCF header
 3332        vcf_reader = self.get_header()
 3333        log.debug("Initial header: " + str(vcf_reader.infos))
 3334
 3335        # Existing annotations
 3336        for vcf_annotation in self.get_header().infos:
 3337
 3338            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3339            log.debug(
 3340                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3341            )
 3342
 3343        if annotations:
 3344
 3345            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3346
 3347                # Export VCF file
 3348                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3349
 3350                # annotation_bigwig_config
 3351                annotation_bigwig_config_list = []
 3352
 3353                for annotation in annotations:
 3354                    annotation_fields = annotations[annotation]
 3355
 3356                    # Annotation Name
 3357                    annotation_name = os.path.basename(annotation)
 3358
 3359                    if not annotation_fields:
 3360                        annotation_fields = {"INFO": None}
 3361
 3362                    log.debug(f"Annotation '{annotation_name}'")
 3363                    log.debug(
 3364                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3365                    )
 3366
 3367                    # Create Database
 3368                    database = Database(
 3369                        database=annotation,
 3370                        databases_folders=databases_folders,
 3371                        assembly=assembly,
 3372                    )
 3373
 3374                    # Find files
 3375                    db_file = database.get_database()
 3376                    db_file = full_path(db_file)
 3377                    db_hdr_file = database.get_header_file()
 3378                    db_hdr_file = full_path(db_hdr_file)
 3379                    db_file_type = database.get_format()
 3380
 3381                    # If db_file is http ?
 3382                    if database.get_database().startswith("http"):
 3383
 3384                        # Datbase is HTTP URL
 3385                        db_file_is_http = True
 3386
 3387                        # DB file keep as URL
 3388                        db_file = database.get_database()
 3389                        log.warning(
 3390                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3391                        )
 3392
 3393                        # Retrieve automatic annotation field name
 3394                        annotation_field = clean_annotation_field(
 3395                            os.path.basename(db_file).replace(".bw", "")
 3396                        )
 3397                        log.debug(
 3398                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3399                        )
 3400
 3401                        # Create automatic header file
 3402                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3403                        with open(db_hdr_file, "w") as f:
 3404                            f.write("##fileformat=VCFv4.2\n")
 3405                            f.write(
 3406                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3407                            )
 3408                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3409
 3410                    else:
 3411
 3412                        # Datbase is NOT HTTP URL
 3413                        db_file_is_http = False
 3414
 3415                    # Check index - try to create if not exists
 3416                    if (
 3417                        db_file is None
 3418                        or db_hdr_file is None
 3419                        or (not os.path.exists(db_file) and not db_file_is_http)
 3420                        or not os.path.exists(db_hdr_file)
 3421                        or not db_file_type in ["bw"]
 3422                    ):
 3423                        # if False:
 3424                        log.error("Annotation failed: database not valid")
 3425                        log.error(f"Annotation annotation file: {db_file}")
 3426                        log.error(f"Annotation annotation file type: {db_file_type}")
 3427                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3428                        raise ValueError(
 3429                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3430                        )
 3431                    else:
 3432
 3433                        # Log
 3434                        log.debug(
 3435                            f"Annotation '{annotation}' - file: "
 3436                            + str(db_file)
 3437                            + " and "
 3438                            + str(db_hdr_file)
 3439                        )
 3440
 3441                        # Load header as VCF object
 3442                        db_hdr_vcf = Variants(input=db_hdr_file)
 3443                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3444                        log.debug(
 3445                            "Annotation database header: "
 3446                            + str(db_hdr_vcf_header_infos)
 3447                        )
 3448
 3449                        # For all fields in database
 3450                        annotation_fields_full = False
 3451                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3452                            annotation_fields = {
 3453                                key: key for key in db_hdr_vcf_header_infos
 3454                            }
 3455                            log.debug(
 3456                                "Annotation database header - All annotations added: "
 3457                                + str(annotation_fields)
 3458                            )
 3459                            annotation_fields_full = True
 3460
 3461                        # Init
 3462                        cyvcf2_header_rename_dict = {}
 3463                        cyvcf2_header_list = []
 3464                        cyvcf2_header_indexes = {}
 3465
 3466                        # process annotation fields
 3467                        for annotation_field in annotation_fields:
 3468
 3469                            # New annotation name
 3470                            annotation_field_new = annotation_fields[annotation_field]
 3471
 3472                            # Check annotation field and index in header
 3473                            if (
 3474                                annotation_field
 3475                                in db_hdr_vcf.get_header_columns_as_list()
 3476                            ):
 3477                                annotation_field_index = (
 3478                                    db_hdr_vcf.get_header_columns_as_list().index(
 3479                                        annotation_field
 3480                                    )
 3481                                    - 3
 3482                                )
 3483                                cyvcf2_header_indexes[annotation_field_new] = (
 3484                                    annotation_field_index
 3485                                )
 3486                            else:
 3487                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3488                                log.error(msg_err)
 3489                                raise ValueError(msg_err)
 3490
 3491                            # Append annotation field in cyvcf2 header list
 3492                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3493                                db_hdr_vcf_header_infos[annotation_field].id
 3494                            )
 3495                            cyvcf2_header_list.append(
 3496                                {
 3497                                    "ID": annotation_field_new,
 3498                                    "Number": db_hdr_vcf_header_infos[
 3499                                        annotation_field
 3500                                    ].num,
 3501                                    "Type": db_hdr_vcf_header_infos[
 3502                                        annotation_field
 3503                                    ].type,
 3504                                    "Description": db_hdr_vcf_header_infos[
 3505                                        annotation_field
 3506                                    ].desc,
 3507                                }
 3508                            )
 3509
 3510                            # Add header on VCF
 3511                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3512                                annotation_field_new,
 3513                                db_hdr_vcf_header_infos[annotation_field].num,
 3514                                db_hdr_vcf_header_infos[annotation_field].type,
 3515                                db_hdr_vcf_header_infos[annotation_field].desc,
 3516                                "HOWARD BigWig annotation",
 3517                                "unknown",
 3518                                self.code_type_map[
 3519                                    db_hdr_vcf_header_infos[annotation_field].type
 3520                                ],
 3521                            )
 3522
 3523                        # Load bigwig database
 3524                        bw_db = pyBigWig.open(db_file)
 3525                        if bw_db.isBigWig():
 3526                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3527                        else:
 3528                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3529                            log.error(msg_err)
 3530                            raise ValueError(msg_err)
 3531
 3532                        annotation_bigwig_config_list.append(
 3533                            {
 3534                                "db_file": db_file,
 3535                                "bw_db": bw_db,
 3536                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3537                                "cyvcf2_header_list": cyvcf2_header_list,
 3538                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3539                            }
 3540                        )
 3541
 3542                # Annotate
 3543                if annotation_bigwig_config_list:
 3544
 3545                    # Annotation config
 3546                    log.debug(
 3547                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3548                    )
 3549
 3550                    # Export VCF file
 3551                    self.export_variant_vcf(
 3552                        vcf_file=tmp_vcf_name,
 3553                        remove_info=True,
 3554                        add_samples=False,
 3555                        index=True,
 3556                    )
 3557
 3558                    # Load input tmp file
 3559                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3560
 3561                    # Add header in input file
 3562                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3563                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3564                            "cyvcf2_header_list", []
 3565                        ):
 3566                            log.info(
 3567                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3568                            )
 3569                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3570
 3571                    # Create output VCF file
 3572                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3573                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3574
 3575                    # Fetch variants
 3576                    log.info(f"Annotations 'bigwig' start...")
 3577                    for variant in input_vcf:
 3578
 3579                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3580
 3581                            # DB and indexes
 3582                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3583                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3584                                "cyvcf2_header_indexes", None
 3585                            )
 3586
 3587                            # Retrieve value from chrom pos
 3588                            res = bw_db.values(
 3589                                variant.CHROM, variant.POS - 1, variant.POS
 3590                            )
 3591
 3592                            # For each annotation fields (and indexes)
 3593                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3594
 3595                                # If value is NOT nNone
 3596                                if not np.isnan(
 3597                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3598                                ):
 3599                                    variant.INFO[cyvcf2_header_index] = res[
 3600                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3601                                    ]
 3602
 3603                        # Add record in output file
 3604                        output_vcf.write_record(variant)
 3605
 3606                    # Log
 3607                    log.debug(f"Annotation done.")
 3608
 3609                    # Close and write file
 3610                    log.info(f"Annotations 'bigwig' write...")
 3611                    output_vcf.close()
 3612                    log.debug(f"Write done.")
 3613
 3614                    # Update variants
 3615                    log.info(f"Annotations 'bigwig' update...")
 3616                    self.update_from_vcf(output_vcf_file)
 3617                    log.debug(f"Update done.")
 3618
 3619        return True
 3620
 3621    def annotation_snpsift(self, threads: int = None) -> None:
 3622        """
 3623        This function annotate with bcftools
 3624
 3625        :param threads: Number of threads to use
 3626        :return: the value of the variable "return_value".
 3627        """
 3628
 3629        # DEBUG
 3630        log.debug("Start annotation with bcftools databases")
 3631
 3632        # Threads
 3633        if not threads:
 3634            threads = self.get_threads()
 3635        log.debug("Threads: " + str(threads))
 3636
 3637        # Config
 3638        config = self.get_config()
 3639        log.debug("Config: " + str(config))
 3640
 3641        # Config - snpSift
 3642        snpsift_bin_command = get_bin_command(
 3643            bin="SnpSift.jar",
 3644            tool="snpsift",
 3645            bin_type="jar",
 3646            config=config,
 3647            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3648        )
 3649        if not snpsift_bin_command:
 3650            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3651            log.error(msg_err)
 3652            raise ValueError(msg_err)
 3653
 3654        # Config - bcftools
 3655        bcftools_bin_command = get_bin_command(
 3656            bin="bcftools",
 3657            tool="bcftools",
 3658            bin_type="bin",
 3659            config=config,
 3660            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3661        )
 3662        if not bcftools_bin_command:
 3663            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3664            log.error(msg_err)
 3665            raise ValueError(msg_err)
 3666
 3667        # Config - BCFTools databases folders
 3668        databases_folders = set(
 3669            self.get_config()
 3670            .get("folders", {})
 3671            .get("databases", {})
 3672            .get("annotations", ["."])
 3673            + self.get_config()
 3674            .get("folders", {})
 3675            .get("databases", {})
 3676            .get("bcftools", ["."])
 3677        )
 3678        log.debug("Databases annotations: " + str(databases_folders))
 3679
 3680        # Param
 3681        annotations = (
 3682            self.get_param()
 3683            .get("annotation", {})
 3684            .get("snpsift", {})
 3685            .get("annotations", None)
 3686        )
 3687        log.debug("Annotations: " + str(annotations))
 3688
 3689        # Assembly
 3690        assembly = self.get_param().get(
 3691            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3692        )
 3693
 3694        # Data
 3695        table_variants = self.get_table_variants()
 3696
 3697        # Check if not empty
 3698        log.debug("Check if not empty")
 3699        sql_query_chromosomes = (
 3700            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3701        )
 3702        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3703        if not sql_query_chromosomes_df["count"][0]:
 3704            log.info(f"VCF empty")
 3705            return
 3706
 3707        # VCF header
 3708        vcf_reader = self.get_header()
 3709        log.debug("Initial header: " + str(vcf_reader.infos))
 3710
 3711        # Existing annotations
 3712        for vcf_annotation in self.get_header().infos:
 3713
 3714            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3715            log.debug(
 3716                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3717            )
 3718
 3719        if annotations:
 3720
 3721            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3722
 3723                # Export VCF file
 3724                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3725
 3726                # Init
 3727                commands = {}
 3728
 3729                for annotation in annotations:
 3730                    annotation_fields = annotations[annotation]
 3731
 3732                    # Annotation Name
 3733                    annotation_name = os.path.basename(annotation)
 3734
 3735                    if not annotation_fields:
 3736                        annotation_fields = {"INFO": None}
 3737
 3738                    log.debug(f"Annotation '{annotation_name}'")
 3739                    log.debug(
 3740                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3741                    )
 3742
 3743                    # Create Database
 3744                    database = Database(
 3745                        database=annotation,
 3746                        databases_folders=databases_folders,
 3747                        assembly=assembly,
 3748                    )
 3749
 3750                    # Find files
 3751                    db_file = database.get_database()
 3752                    db_file = full_path(db_file)
 3753                    db_hdr_file = database.get_header_file()
 3754                    db_hdr_file = full_path(db_hdr_file)
 3755                    db_file_type = database.get_format()
 3756                    db_tbi_file = f"{db_file}.tbi"
 3757                    db_file_compressed = database.is_compressed()
 3758
 3759                    # Check if compressed
 3760                    if not db_file_compressed:
 3761                        log.error(
 3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3763                        )
 3764                        raise ValueError(
 3765                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3766                        )
 3767
 3768                    # Check if indexed
 3769                    if not os.path.exists(db_tbi_file):
 3770                        log.error(
 3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3772                        )
 3773                        raise ValueError(
 3774                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3775                        )
 3776
 3777                    # Check index - try to create if not exists
 3778                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3779                        log.error("Annotation failed: database not valid")
 3780                        log.error(f"Annotation annotation file: {db_file}")
 3781                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3782                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3783                        raise ValueError(
 3784                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3785                        )
 3786                    else:
 3787
 3788                        log.debug(
 3789                            f"Annotation '{annotation}' - file: "
 3790                            + str(db_file)
 3791                            + " and "
 3792                            + str(db_hdr_file)
 3793                        )
 3794
 3795                        # Load header as VCF object
 3796                        db_hdr_vcf = Variants(input=db_hdr_file)
 3797                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3798                        log.debug(
 3799                            "Annotation database header: "
 3800                            + str(db_hdr_vcf_header_infos)
 3801                        )
 3802
 3803                        # For all fields in database
 3804                        annotation_fields_full = False
 3805                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3806                            annotation_fields = {
 3807                                key: key for key in db_hdr_vcf_header_infos
 3808                            }
 3809                            log.debug(
 3810                                "Annotation database header - All annotations added: "
 3811                                + str(annotation_fields)
 3812                            )
 3813                            annotation_fields_full = True
 3814
 3815                        # # Create file for field rename
 3816                        # log.debug("Create file for field rename")
 3817                        # tmp_rename = NamedTemporaryFile(
 3818                        #     prefix=self.get_prefix(),
 3819                        #     dir=self.get_tmp_dir(),
 3820                        #     suffix=".rename",
 3821                        #     delete=False,
 3822                        # )
 3823                        # tmp_rename_name = tmp_rename.name
 3824                        # tmp_files.append(tmp_rename_name)
 3825
 3826                        # Number of fields
 3827                        nb_annotation_field = 0
 3828                        annotation_list = []
 3829                        annotation_infos_rename_list = []
 3830
 3831                        for annotation_field in annotation_fields:
 3832
 3833                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3834                            annotation_fields_new_name = annotation_fields.get(
 3835                                annotation_field, annotation_field
 3836                            )
 3837                            if not annotation_fields_new_name:
 3838                                annotation_fields_new_name = annotation_field
 3839
 3840                            # Check if field is in DB and if field is not elready in input data
 3841                            if (
 3842                                annotation_field in db_hdr_vcf.get_header().infos
 3843                                and annotation_fields_new_name
 3844                                not in self.get_header().infos
 3845                            ):
 3846
 3847                                log.info(
 3848                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3849                                )
 3850
 3851                                # BCFTools annotate param to rename fields
 3852                                if annotation_field != annotation_fields_new_name:
 3853                                    annotation_infos_rename_list.append(
 3854                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3855                                    )
 3856
 3857                                # Add INFO field to header
 3858                                db_hdr_vcf_header_infos_number = (
 3859                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3860                                )
 3861                                db_hdr_vcf_header_infos_type = (
 3862                                    db_hdr_vcf_header_infos[annotation_field].type
 3863                                    or "String"
 3864                                )
 3865                                db_hdr_vcf_header_infos_description = (
 3866                                    db_hdr_vcf_header_infos[annotation_field].desc
 3867                                    or f"{annotation_field} description"
 3868                                )
 3869                                db_hdr_vcf_header_infos_source = (
 3870                                    db_hdr_vcf_header_infos[annotation_field].source
 3871                                    or "unknown"
 3872                                )
 3873                                db_hdr_vcf_header_infos_version = (
 3874                                    db_hdr_vcf_header_infos[annotation_field].version
 3875                                    or "unknown"
 3876                                )
 3877
 3878                                vcf_reader.infos[annotation_fields_new_name] = (
 3879                                    vcf.parser._Info(
 3880                                        annotation_fields_new_name,
 3881                                        db_hdr_vcf_header_infos_number,
 3882                                        db_hdr_vcf_header_infos_type,
 3883                                        db_hdr_vcf_header_infos_description,
 3884                                        db_hdr_vcf_header_infos_source,
 3885                                        db_hdr_vcf_header_infos_version,
 3886                                        self.code_type_map[
 3887                                            db_hdr_vcf_header_infos_type
 3888                                        ],
 3889                                    )
 3890                                )
 3891
 3892                                annotation_list.append(annotation_field)
 3893
 3894                                nb_annotation_field += 1
 3895
 3896                            else:
 3897
 3898                                if (
 3899                                    annotation_field
 3900                                    not in db_hdr_vcf.get_header().infos
 3901                                ):
 3902                                    log.warning(
 3903                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3904                                    )
 3905                                if (
 3906                                    annotation_fields_new_name
 3907                                    in self.get_header().infos
 3908                                ):
 3909                                    log.warning(
 3910                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3911                                    )
 3912
 3913                        log.info(
 3914                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3915                        )
 3916
 3917                        annotation_infos = ",".join(annotation_list)
 3918
 3919                        if annotation_infos != "":
 3920
 3921                            # Annotated VCF (and error file)
 3922                            tmp_annotation_vcf_name = os.path.join(
 3923                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3924                            )
 3925                            tmp_annotation_vcf_name_err = (
 3926                                tmp_annotation_vcf_name + ".err"
 3927                            )
 3928
 3929                            # Add fields to annotate
 3930                            if not annotation_fields_full:
 3931                                annotation_infos_option = f"-info {annotation_infos}"
 3932                            else:
 3933                                annotation_infos_option = ""
 3934
 3935                            # Info fields rename
 3936                            if annotation_infos_rename_list:
 3937                                annotation_infos_rename = " -c " + ",".join(
 3938                                    annotation_infos_rename_list
 3939                                )
 3940                            else:
 3941                                annotation_infos_rename = ""
 3942
 3943                            # Annotate command
 3944                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3945
 3946                            # Add command
 3947                            commands[command_annotate] = tmp_annotation_vcf_name
 3948
 3949                if commands:
 3950
 3951                    # Export VCF file
 3952                    self.export_variant_vcf(
 3953                        vcf_file=tmp_vcf_name,
 3954                        remove_info=True,
 3955                        add_samples=False,
 3956                        index=True,
 3957                    )
 3958                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3959
 3960                    # Num command
 3961                    nb_command = 0
 3962
 3963                    # Annotate
 3964                    for command_annotate in commands:
 3965                        nb_command += 1
 3966                        log.info(
 3967                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3968                        )
 3969                        log.debug(f"command_annotate={command_annotate}")
 3970                        run_parallel_commands([command_annotate], threads)
 3971
 3972                        # Debug
 3973                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3974
 3975                        # Update variants
 3976                        log.info(
 3977                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3978                        )
 3979                        self.update_from_vcf(commands[command_annotate])
 3980
 3981    def annotation_bcftools(self, threads: int = None) -> None:
 3982        """
 3983        This function annotate with bcftools
 3984
 3985        :param threads: Number of threads to use
 3986        :return: the value of the variable "return_value".
 3987        """
 3988
 3989        # DEBUG
 3990        log.debug("Start annotation with bcftools databases")
 3991
 3992        # Threads
 3993        if not threads:
 3994            threads = self.get_threads()
 3995        log.debug("Threads: " + str(threads))
 3996
 3997        # Config
 3998        config = self.get_config()
 3999        log.debug("Config: " + str(config))
 4000
 4001        # DEBUG
 4002        delete_tmp = True
 4003        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4004            delete_tmp = False
 4005            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4006
 4007        # Config - BCFTools bin command
 4008        bcftools_bin_command = get_bin_command(
 4009            bin="bcftools",
 4010            tool="bcftools",
 4011            bin_type="bin",
 4012            config=config,
 4013            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4014        )
 4015        if not bcftools_bin_command:
 4016            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4017            log.error(msg_err)
 4018            raise ValueError(msg_err)
 4019
 4020        # Config - BCFTools databases folders
 4021        databases_folders = set(
 4022            self.get_config()
 4023            .get("folders", {})
 4024            .get("databases", {})
 4025            .get("annotations", ["."])
 4026            + self.get_config()
 4027            .get("folders", {})
 4028            .get("databases", {})
 4029            .get("bcftools", ["."])
 4030        )
 4031        log.debug("Databases annotations: " + str(databases_folders))
 4032
 4033        # Param
 4034        annotations = (
 4035            self.get_param()
 4036            .get("annotation", {})
 4037            .get("bcftools", {})
 4038            .get("annotations", None)
 4039        )
 4040        log.debug("Annotations: " + str(annotations))
 4041
 4042        # Assembly
 4043        assembly = self.get_param().get(
 4044            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4045        )
 4046
 4047        # Data
 4048        table_variants = self.get_table_variants()
 4049
 4050        # Check if not empty
 4051        log.debug("Check if not empty")
 4052        sql_query_chromosomes = (
 4053            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4054        )
 4055        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4056        if not sql_query_chromosomes_df["count"][0]:
 4057            log.info(f"VCF empty")
 4058            return
 4059
 4060        # Export in VCF
 4061        log.debug("Create initial file to annotate")
 4062        tmp_vcf = NamedTemporaryFile(
 4063            prefix=self.get_prefix(),
 4064            dir=self.get_tmp_dir(),
 4065            suffix=".vcf.gz",
 4066            delete=False,
 4067        )
 4068        tmp_vcf_name = tmp_vcf.name
 4069
 4070        # VCF header
 4071        vcf_reader = self.get_header()
 4072        log.debug("Initial header: " + str(vcf_reader.infos))
 4073
 4074        # Existing annotations
 4075        for vcf_annotation in self.get_header().infos:
 4076
 4077            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4078            log.debug(
 4079                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4080            )
 4081
 4082        if annotations:
 4083
 4084            tmp_ann_vcf_list = []
 4085            commands = []
 4086            tmp_files = []
 4087            err_files = []
 4088
 4089            for annotation in annotations:
 4090                annotation_fields = annotations[annotation]
 4091
 4092                # Annotation Name
 4093                annotation_name = os.path.basename(annotation)
 4094
 4095                if not annotation_fields:
 4096                    annotation_fields = {"INFO": None}
 4097
 4098                log.debug(f"Annotation '{annotation_name}'")
 4099                log.debug(
 4100                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4101                )
 4102
 4103                # Create Database
 4104                database = Database(
 4105                    database=annotation,
 4106                    databases_folders=databases_folders,
 4107                    assembly=assembly,
 4108                )
 4109
 4110                # Find files
 4111                db_file = database.get_database()
 4112                db_file = full_path(db_file)
 4113                db_hdr_file = database.get_header_file()
 4114                db_hdr_file = full_path(db_hdr_file)
 4115                db_file_type = database.get_format()
 4116                db_tbi_file = f"{db_file}.tbi"
 4117                db_file_compressed = database.is_compressed()
 4118
 4119                # Check if compressed
 4120                if not db_file_compressed:
 4121                    log.error(
 4122                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4123                    )
 4124                    raise ValueError(
 4125                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4126                    )
 4127
 4128                # Check if indexed
 4129                if not os.path.exists(db_tbi_file):
 4130                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4131                    raise ValueError(
 4132                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4133                    )
 4134
 4135                # Check index - try to create if not exists
 4136                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4137                    log.error("Annotation failed: database not valid")
 4138                    log.error(f"Annotation annotation file: {db_file}")
 4139                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4140                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4141                    raise ValueError(
 4142                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4143                    )
 4144                else:
 4145
 4146                    log.debug(
 4147                        f"Annotation '{annotation}' - file: "
 4148                        + str(db_file)
 4149                        + " and "
 4150                        + str(db_hdr_file)
 4151                    )
 4152
 4153                    # Load header as VCF object
 4154                    db_hdr_vcf = Variants(input=db_hdr_file)
 4155                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4156                    log.debug(
 4157                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4158                    )
 4159
 4160                    # For all fields in database
 4161                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4162                        annotation_fields = {
 4163                            key: key for key in db_hdr_vcf_header_infos
 4164                        }
 4165                        log.debug(
 4166                            "Annotation database header - All annotations added: "
 4167                            + str(annotation_fields)
 4168                        )
 4169
 4170                    # Number of fields
 4171                    nb_annotation_field = 0
 4172                    annotation_list = []
 4173
 4174                    for annotation_field in annotation_fields:
 4175
 4176                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4177                        annotation_fields_new_name = annotation_fields.get(
 4178                            annotation_field, annotation_field
 4179                        )
 4180                        if not annotation_fields_new_name:
 4181                            annotation_fields_new_name = annotation_field
 4182
 4183                        # Check if field is in DB and if field is not elready in input data
 4184                        if (
 4185                            annotation_field in db_hdr_vcf.get_header().infos
 4186                            and annotation_fields_new_name
 4187                            not in self.get_header().infos
 4188                        ):
 4189
 4190                            log.info(
 4191                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4192                            )
 4193
 4194                            # Add INFO field to header
 4195                            db_hdr_vcf_header_infos_number = (
 4196                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4197                            )
 4198                            db_hdr_vcf_header_infos_type = (
 4199                                db_hdr_vcf_header_infos[annotation_field].type
 4200                                or "String"
 4201                            )
 4202                            db_hdr_vcf_header_infos_description = (
 4203                                db_hdr_vcf_header_infos[annotation_field].desc
 4204                                or f"{annotation_field} description"
 4205                            )
 4206                            db_hdr_vcf_header_infos_source = (
 4207                                db_hdr_vcf_header_infos[annotation_field].source
 4208                                or "unknown"
 4209                            )
 4210                            db_hdr_vcf_header_infos_version = (
 4211                                db_hdr_vcf_header_infos[annotation_field].version
 4212                                or "unknown"
 4213                            )
 4214
 4215                            vcf_reader.infos[annotation_fields_new_name] = (
 4216                                vcf.parser._Info(
 4217                                    annotation_fields_new_name,
 4218                                    db_hdr_vcf_header_infos_number,
 4219                                    db_hdr_vcf_header_infos_type,
 4220                                    db_hdr_vcf_header_infos_description,
 4221                                    db_hdr_vcf_header_infos_source,
 4222                                    db_hdr_vcf_header_infos_version,
 4223                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4224                                )
 4225                            )
 4226
 4227                            # annotation_list.append(annotation_field)
 4228                            if annotation_field != annotation_fields_new_name:
 4229                                annotation_list.append(
 4230                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4231                                )
 4232                            else:
 4233                                annotation_list.append(annotation_field)
 4234
 4235                            nb_annotation_field += 1
 4236
 4237                        else:
 4238
 4239                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4240                                log.warning(
 4241                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4242                                )
 4243                            if annotation_fields_new_name in self.get_header().infos:
 4244                                log.warning(
 4245                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4246                                )
 4247
 4248                    log.info(
 4249                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4250                    )
 4251
 4252                    annotation_infos = ",".join(annotation_list)
 4253
 4254                    if annotation_infos != "":
 4255
 4256                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4257                        log.debug("Protect Header file - remove #CHROM line if exists")
 4258                        tmp_header_vcf = NamedTemporaryFile(
 4259                            prefix=self.get_prefix(),
 4260                            dir=self.get_tmp_dir(),
 4261                            suffix=".hdr",
 4262                            delete=False,
 4263                        )
 4264                        tmp_header_vcf_name = tmp_header_vcf.name
 4265                        tmp_files.append(tmp_header_vcf_name)
 4266                        # Command
 4267                        if db_hdr_file.endswith(".gz"):
 4268                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4269                        else:
 4270                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4271                        # Run
 4272                        run_parallel_commands([command_extract_header], 1)
 4273
 4274                        # Find chomosomes
 4275                        log.debug("Find chromosomes ")
 4276                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4277                        sql_query_chromosomes_df = self.get_query_to_df(
 4278                            sql_query_chromosomes
 4279                        )
 4280                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4281
 4282                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4283
 4284                        # BED columns in the annotation file
 4285                        if db_file_type in ["bed"]:
 4286                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4287
 4288                        for chrom in chomosomes_list:
 4289
 4290                            # Create BED on initial VCF
 4291                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4292                            tmp_bed = NamedTemporaryFile(
 4293                                prefix=self.get_prefix(),
 4294                                dir=self.get_tmp_dir(),
 4295                                suffix=".bed",
 4296                                delete=False,
 4297                            )
 4298                            tmp_bed_name = tmp_bed.name
 4299                            tmp_files.append(tmp_bed_name)
 4300
 4301                            # Detecte regions
 4302                            log.debug(
 4303                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4304                            )
 4305                            window = 1000000
 4306                            sql_query_intervals_for_bed = f"""
 4307                                SELECT  \"#CHROM\",
 4308                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4309                                        \"POS\"+{window}
 4310                                FROM {table_variants} as table_variants
 4311                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4312                            """
 4313                            regions = self.conn.execute(
 4314                                sql_query_intervals_for_bed
 4315                            ).fetchall()
 4316                            merged_regions = merge_regions(regions)
 4317                            log.debug(
 4318                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4319                            )
 4320
 4321                            header = ["#CHROM", "START", "END"]
 4322                            with open(tmp_bed_name, "w") as f:
 4323                                # Write the header with tab delimiter
 4324                                f.write("\t".join(header) + "\n")
 4325                                for d in merged_regions:
 4326                                    # Write each data row with tab delimiter
 4327                                    f.write("\t".join(map(str, d)) + "\n")
 4328
 4329                            # Tmp files
 4330                            tmp_annotation_vcf = NamedTemporaryFile(
 4331                                prefix=self.get_prefix(),
 4332                                dir=self.get_tmp_dir(),
 4333                                suffix=".vcf.gz",
 4334                                delete=False,
 4335                            )
 4336                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4337                            tmp_files.append(tmp_annotation_vcf_name)
 4338                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4339                            tmp_annotation_vcf_name_err = (
 4340                                tmp_annotation_vcf_name + ".err"
 4341                            )
 4342                            err_files.append(tmp_annotation_vcf_name_err)
 4343
 4344                            # Annotate Command
 4345                            log.debug(
 4346                                f"Annotation '{annotation}' - add bcftools command"
 4347                            )
 4348
 4349                            # Command
 4350                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4351
 4352                            # Add command
 4353                            commands.append(command_annotate)
 4354
 4355            # if some commands
 4356            if commands:
 4357
 4358                # Export VCF file
 4359                self.export_variant_vcf(
 4360                    vcf_file=tmp_vcf_name,
 4361                    remove_info=True,
 4362                    add_samples=False,
 4363                    index=True,
 4364                )
 4365
 4366                # Threads
 4367                # calculate threads for annotated commands
 4368                if commands:
 4369                    threads_bcftools_annotate = round(threads / len(commands))
 4370                else:
 4371                    threads_bcftools_annotate = 1
 4372
 4373                if not threads_bcftools_annotate:
 4374                    threads_bcftools_annotate = 1
 4375
 4376                # Add threads option to bcftools commands
 4377                if threads_bcftools_annotate > 1:
 4378                    commands_threaded = []
 4379                    for command in commands:
 4380                        commands_threaded.append(
 4381                            command.replace(
 4382                                f"{bcftools_bin_command} annotate ",
 4383                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4384                            )
 4385                        )
 4386                    commands = commands_threaded
 4387
 4388                # Command annotation multithreading
 4389                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4390                log.info(
 4391                    f"Annotation - Annotation multithreaded in "
 4392                    + str(len(commands))
 4393                    + " commands"
 4394                )
 4395
 4396                run_parallel_commands(commands, threads)
 4397
 4398                # Merge
 4399                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4400
 4401                if tmp_ann_vcf_list_cmd:
 4402
 4403                    # Tmp file
 4404                    tmp_annotate_vcf = NamedTemporaryFile(
 4405                        prefix=self.get_prefix(),
 4406                        dir=self.get_tmp_dir(),
 4407                        suffix=".vcf.gz",
 4408                        delete=True,
 4409                    )
 4410                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4411                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4412                    err_files.append(tmp_annotate_vcf_name_err)
 4413
 4414                    # Tmp file remove command
 4415                    tmp_files_remove_command = ""
 4416                    if tmp_files:
 4417                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4418
 4419                    # Command merge
 4420                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4421                    log.info(
 4422                        f"Annotation - Annotation merging "
 4423                        + str(len(commands))
 4424                        + " annotated files"
 4425                    )
 4426                    log.debug(f"Annotation - merge command: {merge_command}")
 4427                    run_parallel_commands([merge_command], 1)
 4428
 4429                    # Error messages
 4430                    log.info(f"Error/Warning messages:")
 4431                    error_message_command_all = []
 4432                    error_message_command_warning = []
 4433                    error_message_command_err = []
 4434                    for err_file in err_files:
 4435                        with open(err_file, "r") as f:
 4436                            for line in f:
 4437                                message = line.strip()
 4438                                error_message_command_all.append(message)
 4439                                if line.startswith("[W::"):
 4440                                    error_message_command_warning.append(message)
 4441                                if line.startswith("[E::"):
 4442                                    error_message_command_err.append(
 4443                                        f"{err_file}: " + message
 4444                                    )
 4445                    # log info
 4446                    for message in list(
 4447                        set(error_message_command_err + error_message_command_warning)
 4448                    ):
 4449                        log.info(f"   {message}")
 4450                    # debug info
 4451                    for message in list(set(error_message_command_all)):
 4452                        log.debug(f"   {message}")
 4453                    # failed
 4454                    if len(error_message_command_err):
 4455                        log.error("Annotation failed: Error in commands")
 4456                        raise ValueError("Annotation failed: Error in commands")
 4457
 4458                    # Update variants
 4459                    log.info(f"Annotation - Updating...")
 4460                    self.update_from_vcf(tmp_annotate_vcf_name)
 4461
 4462    def annotation_exomiser(self, threads: int = None) -> None:
 4463        """
 4464        This function annotate with Exomiser
 4465
 4466        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4467        - "analysis" (dict/file):
 4468            Full analysis dictionnary parameters (see Exomiser docs).
 4469            Either a dict, or a file in JSON or YAML format.
 4470            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4471            Default : None
 4472        - "preset" (string):
 4473            Analysis preset (available in config folder).
 4474            Used if no full "analysis" is provided.
 4475            Default: "exome"
 4476        - "phenopacket" (dict/file):
 4477            Samples and phenotipic features parameters (see Exomiser docs).
 4478            Either a dict, or a file in JSON or YAML format.
 4479            Default: None
 4480        - "subject" (dict):
 4481            Sample parameters (see Exomiser docs).
 4482            Example:
 4483                "subject":
 4484                    {
 4485                        "id": "ISDBM322017",
 4486                        "sex": "FEMALE"
 4487                    }
 4488            Default: None
 4489        - "sample" (string):
 4490            Sample name to construct "subject" section:
 4491                "subject":
 4492                    {
 4493                        "id": "<sample>",
 4494                        "sex": "UNKNOWN_SEX"
 4495                    }
 4496            Default: None
 4497        - "phenotypicFeatures" (dict)
 4498            Phenotypic features to construct "subject" section.
 4499            Example:
 4500                "phenotypicFeatures":
 4501                    [
 4502                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4503                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4504                    ]
 4505        - "hpo" (list)
 4506            List of HPO ids as phenotypic features.
 4507            Example:
 4508                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4509            Default: []
 4510        - "outputOptions" (dict):
 4511            Output options (see Exomiser docs).
 4512            Default:
 4513                "output_options" =
 4514                    {
 4515                        "outputContributingVariantsOnly": False,
 4516                        "numGenes": 0,
 4517                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4518                    }
 4519        - "transcript_source" (string):
 4520            Transcript source (either "refseq", "ucsc", "ensembl")
 4521            Default: "refseq"
 4522        - "exomiser_to_info" (boolean):
 4523            Add exomiser TSV file columns as INFO fields in VCF.
 4524            Default: False
 4525        - "release" (string):
 4526            Exomise database release.
 4527            If not exists, database release will be downloaded (take a while).
 4528            Default: None (provided by application.properties configuration file)
 4529        - "exomiser_application_properties" (file):
 4530            Exomiser configuration file (see Exomiser docs).
 4531            Useful to automatically download databases (especially for specific genome databases).
 4532
 4533        Notes:
 4534        - If no sample in parameters, first sample in VCF will be chosen
 4535        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4536
 4537        :param threads: The number of threads to use
 4538        :return: None.
 4539        """
 4540
 4541        # DEBUG
 4542        log.debug("Start annotation with Exomiser databases")
 4543
 4544        # Threads
 4545        if not threads:
 4546            threads = self.get_threads()
 4547        log.debug("Threads: " + str(threads))
 4548
 4549        # Config
 4550        config = self.get_config()
 4551        log.debug("Config: " + str(config))
 4552
 4553        # Config - Folders - Databases
 4554        databases_folders = (
 4555            config.get("folders", {})
 4556            .get("databases", {})
 4557            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4558        )
 4559        databases_folders = full_path(databases_folders)
 4560        if not os.path.exists(databases_folders):
 4561            log.error(f"Databases annotations: {databases_folders} NOT found")
 4562        log.debug("Databases annotations: " + str(databases_folders))
 4563
 4564        # Config - Exomiser
 4565        exomiser_bin_command = get_bin_command(
 4566            bin="exomiser-cli*.jar",
 4567            tool="exomiser",
 4568            bin_type="jar",
 4569            config=config,
 4570            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4571        )
 4572        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4573        if not exomiser_bin_command:
 4574            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4575            log.error(msg_err)
 4576            raise ValueError(msg_err)
 4577
 4578        # Param
 4579        param = self.get_param()
 4580        log.debug("Param: " + str(param))
 4581
 4582        # Param - Exomiser
 4583        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4584        log.debug(f"Param Exomiser: {param_exomiser}")
 4585
 4586        # Param - Assembly
 4587        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4588        log.debug("Assembly: " + str(assembly))
 4589
 4590        # Data
 4591        table_variants = self.get_table_variants()
 4592
 4593        # Check if not empty
 4594        log.debug("Check if not empty")
 4595        sql_query_chromosomes = (
 4596            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4597        )
 4598        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4599            log.info(f"VCF empty")
 4600            return False
 4601
 4602        # VCF header
 4603        vcf_reader = self.get_header()
 4604        log.debug("Initial header: " + str(vcf_reader.infos))
 4605
 4606        # Samples
 4607        samples = self.get_header_sample_list()
 4608        if not samples:
 4609            log.error("No Samples in VCF")
 4610            return False
 4611        log.debug(f"Samples: {samples}")
 4612
 4613        # Memory limit
 4614        memory_limit = self.get_memory("8G")
 4615        log.debug(f"memory_limit: {memory_limit}")
 4616
 4617        # Exomiser java options
 4618        exomiser_java_options = (
 4619            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4620        )
 4621        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4622
 4623        # Download Exomiser (if not exists)
 4624        exomiser_release = param_exomiser.get("release", None)
 4625        exomiser_application_properties = param_exomiser.get(
 4626            "exomiser_application_properties", None
 4627        )
 4628        databases_download_exomiser(
 4629            assemblies=[assembly],
 4630            exomiser_folder=databases_folders,
 4631            exomiser_release=exomiser_release,
 4632            exomiser_phenotype_release=exomiser_release,
 4633            exomiser_application_properties=exomiser_application_properties,
 4634        )
 4635
 4636        # Force annotation
 4637        force_update_annotation = True
 4638
 4639        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4640            log.debug("Start annotation Exomiser")
 4641
 4642            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4643
 4644                # tmp_dir = "/tmp/exomiser"
 4645
 4646                ### ANALYSIS ###
 4647                ################
 4648
 4649                # Create analysis.json through analysis dict
 4650                # either analysis in param or by default
 4651                # depending on preset exome/genome)
 4652
 4653                # Init analysis dict
 4654                param_exomiser_analysis_dict = {}
 4655
 4656                # analysis from param
 4657                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4658                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4659
 4660                # If analysis in param -> load anlaysis json
 4661                if param_exomiser_analysis:
 4662
 4663                    # If param analysis is a file and exists
 4664                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4665                        param_exomiser_analysis
 4666                    ):
 4667                        # Load analysis file into analysis dict (either yaml or json)
 4668                        with open(param_exomiser_analysis) as json_file:
 4669                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4670
 4671                    # If param analysis is a dict
 4672                    elif isinstance(param_exomiser_analysis, dict):
 4673                        # Load analysis dict into analysis dict (either yaml or json)
 4674                        param_exomiser_analysis_dict = param_exomiser_analysis
 4675
 4676                    # Error analysis type
 4677                    else:
 4678                        log.error(f"Analysis type unknown. Check param file.")
 4679                        raise ValueError(f"Analysis type unknown. Check param file.")
 4680
 4681                # Case no input analysis config file/dict
 4682                # Use preset (exome/genome) to open default config file
 4683                if not param_exomiser_analysis_dict:
 4684
 4685                    # default preset
 4686                    default_preset = "exome"
 4687
 4688                    # Get param preset or default preset
 4689                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4690
 4691                    # Try to find if preset is a file
 4692                    if os.path.exists(param_exomiser_preset):
 4693                        # Preset file is provided in full path
 4694                        param_exomiser_analysis_default_config_file = (
 4695                            param_exomiser_preset
 4696                        )
 4697                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4698                    #     # Preset file is provided in full path
 4699                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4700                    elif os.path.exists(
 4701                        os.path.join(folder_config, param_exomiser_preset)
 4702                    ):
 4703                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4704                        param_exomiser_analysis_default_config_file = os.path.join(
 4705                            folder_config, param_exomiser_preset
 4706                        )
 4707                    else:
 4708                        # Construct preset file
 4709                        param_exomiser_analysis_default_config_file = os.path.join(
 4710                            folder_config,
 4711                            f"preset-{param_exomiser_preset}-analysis.json",
 4712                        )
 4713
 4714                    # If preset file exists
 4715                    param_exomiser_analysis_default_config_file = full_path(
 4716                        param_exomiser_analysis_default_config_file
 4717                    )
 4718                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4719                        # Load prest file into analysis dict (either yaml or json)
 4720                        with open(
 4721                            param_exomiser_analysis_default_config_file
 4722                        ) as json_file:
 4723                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4724                                json_file
 4725                            )
 4726
 4727                    # Error preset file
 4728                    else:
 4729                        log.error(
 4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4731                        )
 4732                        raise ValueError(
 4733                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4734                        )
 4735
 4736                # If no analysis dict created
 4737                if not param_exomiser_analysis_dict:
 4738                    log.error(f"No analysis config")
 4739                    raise ValueError(f"No analysis config")
 4740
 4741                # Log
 4742                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4743
 4744                ### PHENOPACKET ###
 4745                ###################
 4746
 4747                # If no PhenoPacket in analysis dict -> check in param
 4748                if "phenopacket" not in param_exomiser_analysis_dict:
 4749
 4750                    # If PhenoPacket in param -> load anlaysis json
 4751                    if param_exomiser.get("phenopacket", None):
 4752
 4753                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4754                        param_exomiser_phenopacket = full_path(
 4755                            param_exomiser_phenopacket
 4756                        )
 4757
 4758                        # If param phenopacket is a file and exists
 4759                        if isinstance(
 4760                            param_exomiser_phenopacket, str
 4761                        ) and os.path.exists(param_exomiser_phenopacket):
 4762                            # Load phenopacket file into analysis dict (either yaml or json)
 4763                            with open(param_exomiser_phenopacket) as json_file:
 4764                                param_exomiser_analysis_dict["phenopacket"] = (
 4765                                    yaml.safe_load(json_file)
 4766                                )
 4767
 4768                        # If param phenopacket is a dict
 4769                        elif isinstance(param_exomiser_phenopacket, dict):
 4770                            # Load phenopacket dict into analysis dict (either yaml or json)
 4771                            param_exomiser_analysis_dict["phenopacket"] = (
 4772                                param_exomiser_phenopacket
 4773                            )
 4774
 4775                        # Error phenopacket type
 4776                        else:
 4777                            log.error(f"Phenopacket type unknown. Check param file.")
 4778                            raise ValueError(
 4779                                f"Phenopacket type unknown. Check param file."
 4780                            )
 4781
 4782                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4783                if "phenopacket" not in param_exomiser_analysis_dict:
 4784
 4785                    # Init PhenoPacket
 4786                    param_exomiser_analysis_dict["phenopacket"] = {
 4787                        "id": "analysis",
 4788                        "proband": {},
 4789                    }
 4790
 4791                    ### Add subject ###
 4792
 4793                    # If subject exists
 4794                    param_exomiser_subject = param_exomiser.get("subject", {})
 4795
 4796                    # If subject not exists -> found sample ID
 4797                    if not param_exomiser_subject:
 4798
 4799                        # Found sample ID in param
 4800                        sample = param_exomiser.get("sample", None)
 4801
 4802                        # Find sample ID (first sample)
 4803                        if not sample:
 4804                            sample_list = self.get_header_sample_list()
 4805                            if len(sample_list) > 0:
 4806                                sample = sample_list[0]
 4807                            else:
 4808                                log.error(f"No sample found")
 4809                                raise ValueError(f"No sample found")
 4810
 4811                        # Create subject
 4812                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4813
 4814                    # Add to dict
 4815                    param_exomiser_analysis_dict["phenopacket"][
 4816                        "subject"
 4817                    ] = param_exomiser_subject
 4818
 4819                    ### Add "phenotypicFeatures" ###
 4820
 4821                    # If phenotypicFeatures exists
 4822                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4823                        "phenotypicFeatures", []
 4824                    )
 4825
 4826                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4827                    if not param_exomiser_phenotypicfeatures:
 4828
 4829                        # Found HPO in param
 4830                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4831
 4832                        # Split HPO if list in string format separated by comma
 4833                        if isinstance(param_exomiser_hpo, str):
 4834                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4835
 4836                        # Create HPO list
 4837                        for hpo in param_exomiser_hpo:
 4838                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4839                            param_exomiser_phenotypicfeatures.append(
 4840                                {
 4841                                    "type": {
 4842                                        "id": f"HP:{hpo_clean}",
 4843                                        "label": f"HP:{hpo_clean}",
 4844                                    }
 4845                                }
 4846                            )
 4847
 4848                    # Add to dict
 4849                    param_exomiser_analysis_dict["phenopacket"][
 4850                        "phenotypicFeatures"
 4851                    ] = param_exomiser_phenotypicfeatures
 4852
 4853                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4854                    if not param_exomiser_phenotypicfeatures:
 4855                        for step in param_exomiser_analysis_dict.get(
 4856                            "analysis", {}
 4857                        ).get("steps", []):
 4858                            if "hiPhivePrioritiser" in step:
 4859                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4860                                    "steps", []
 4861                                ).remove(step)
 4862
 4863                ### Add Input File ###
 4864
 4865                # Initial file name and htsFiles
 4866                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4867                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4868                    {
 4869                        "uri": tmp_vcf_name,
 4870                        "htsFormat": "VCF",
 4871                        "genomeAssembly": assembly,
 4872                    }
 4873                ]
 4874
 4875                ### Add metaData ###
 4876
 4877                # If metaData not in analysis dict
 4878                if "metaData" not in param_exomiser_analysis_dict:
 4879                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4880                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4881                        "createdBy": "howard",
 4882                        "phenopacketSchemaVersion": 1,
 4883                    }
 4884
 4885                ### OutputOptions ###
 4886
 4887                # Init output result folder
 4888                output_results = os.path.join(tmp_dir, "results")
 4889
 4890                # If no outputOptions in analysis dict
 4891                if "outputOptions" not in param_exomiser_analysis_dict:
 4892
 4893                    # default output formats
 4894                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4895
 4896                    # Get outputOptions in param
 4897                    output_options = param_exomiser.get("outputOptions", None)
 4898
 4899                    # If no output_options in param -> check
 4900                    if not output_options:
 4901                        output_options = {
 4902                            "outputContributingVariantsOnly": False,
 4903                            "numGenes": 0,
 4904                            "outputFormats": defaut_output_formats,
 4905                        }
 4906
 4907                    # Replace outputDirectory in output options
 4908                    output_options["outputDirectory"] = output_results
 4909                    output_options["outputFileName"] = "howard"
 4910
 4911                    # Add outputOptions in analysis dict
 4912                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4913
 4914                else:
 4915
 4916                    # Replace output_results and output format (if exists in param)
 4917                    param_exomiser_analysis_dict["outputOptions"][
 4918                        "outputDirectory"
 4919                    ] = output_results
 4920                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4921                        list(
 4922                            set(
 4923                                param_exomiser_analysis_dict.get(
 4924                                    "outputOptions", {}
 4925                                ).get("outputFormats", [])
 4926                                + ["TSV_VARIANT", "VCF"]
 4927                            )
 4928                        )
 4929                    )
 4930
 4931                # log
 4932                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4933
 4934                ### ANALYSIS FILE ###
 4935                #####################
 4936
 4937                ### Full JSON analysis config file ###
 4938
 4939                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4940                with open(exomiser_analysis, "w") as fp:
 4941                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4942
 4943                ### SPLIT analysis and sample config files
 4944
 4945                # Splitted analysis dict
 4946                param_exomiser_analysis_dict_for_split = (
 4947                    param_exomiser_analysis_dict.copy()
 4948                )
 4949
 4950                # Phenopacket JSON file
 4951                exomiser_analysis_phenopacket = os.path.join(
 4952                    tmp_dir, "analysis_phenopacket.json"
 4953                )
 4954                with open(exomiser_analysis_phenopacket, "w") as fp:
 4955                    json.dump(
 4956                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4957                        fp,
 4958                        indent=4,
 4959                    )
 4960
 4961                # Analysis JSON file without Phenopacket parameters
 4962                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4963                exomiser_analysis_analysis = os.path.join(
 4964                    tmp_dir, "analysis_analysis.json"
 4965                )
 4966                with open(exomiser_analysis_analysis, "w") as fp:
 4967                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4968
 4969                ### INITAL VCF file ###
 4970                #######################
 4971
 4972                ### Create list of samples to use and include inti initial VCF file ####
 4973
 4974                # Subject (main sample)
 4975                # Get sample ID in analysis dict
 4976                sample_subject = (
 4977                    param_exomiser_analysis_dict.get("phenopacket", {})
 4978                    .get("subject", {})
 4979                    .get("id", None)
 4980                )
 4981                sample_proband = (
 4982                    param_exomiser_analysis_dict.get("phenopacket", {})
 4983                    .get("proband", {})
 4984                    .get("subject", {})
 4985                    .get("id", None)
 4986                )
 4987                sample = []
 4988                if sample_subject:
 4989                    sample.append(sample_subject)
 4990                if sample_proband:
 4991                    sample.append(sample_proband)
 4992
 4993                # Get sample ID within Pedigree
 4994                pedigree_persons_list = (
 4995                    param_exomiser_analysis_dict.get("phenopacket", {})
 4996                    .get("pedigree", {})
 4997                    .get("persons", {})
 4998                )
 4999
 5000                # Create list with all sample ID in pedigree (if exists)
 5001                pedigree_persons = []
 5002                for person in pedigree_persons_list:
 5003                    pedigree_persons.append(person.get("individualId"))
 5004
 5005                # Concat subject sample ID and samples ID in pedigreesamples
 5006                samples = list(set(sample + pedigree_persons))
 5007
 5008                # Check if sample list is not empty
 5009                if not samples:
 5010                    log.error(f"No samples found")
 5011                    raise ValueError(f"No samples found")
 5012
 5013                # Create VCF with sample (either sample in param or first one by default)
 5014                # Export VCF file
 5015                self.export_variant_vcf(
 5016                    vcf_file=tmp_vcf_name,
 5017                    remove_info=True,
 5018                    add_samples=True,
 5019                    list_samples=samples,
 5020                    index=False,
 5021                )
 5022
 5023                ### Execute Exomiser ###
 5024                ########################
 5025
 5026                # Init command
 5027                exomiser_command = ""
 5028
 5029                # Command exomiser options
 5030                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5031
 5032                # Release
 5033                exomiser_release = param_exomiser.get("release", None)
 5034                if exomiser_release:
 5035                    # phenotype data version
 5036                    exomiser_options += (
 5037                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5038                    )
 5039                    # data version
 5040                    exomiser_options += (
 5041                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5042                    )
 5043                    # variant white list
 5044                    variant_white_list_file = (
 5045                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5046                    )
 5047                    if os.path.exists(
 5048                        os.path.join(
 5049                            databases_folders, assembly, variant_white_list_file
 5050                        )
 5051                    ):
 5052                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5053
 5054                # transcript_source
 5055                transcript_source = param_exomiser.get(
 5056                    "transcript_source", None
 5057                )  # ucsc, refseq, ensembl
 5058                if transcript_source:
 5059                    exomiser_options += (
 5060                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5061                    )
 5062
 5063                # If analysis contain proband param
 5064                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5065                    "proband", {}
 5066                ):
 5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5068
 5069                # If no proband (usually uniq sample)
 5070                else:
 5071                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5072
 5073                # Log
 5074                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5075
 5076                # Run command
 5077                result = subprocess.call(
 5078                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5079                )
 5080                if result:
 5081                    log.error("Exomiser command failed")
 5082                    raise ValueError("Exomiser command failed")
 5083
 5084                ### RESULTS ###
 5085                ###############
 5086
 5087                ### Annotate with TSV fields ###
 5088
 5089                # Init result tsv file
 5090                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5091
 5092                # Init result tsv file
 5093                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5094
 5095                # Parse TSV file and explode columns in INFO field
 5096                if exomiser_to_info and os.path.exists(output_results_tsv):
 5097
 5098                    # Log
 5099                    log.debug("Exomiser columns to VCF INFO field")
 5100
 5101                    # Retrieve columns and types
 5102                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5103                    output_results_tsv_df = self.get_query_to_df(query)
 5104                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5105
 5106                    # Init concat fields for update
 5107                    sql_query_update_concat_fields = []
 5108
 5109                    # Fields to avoid
 5110                    fields_to_avoid = [
 5111                        "CONTIG",
 5112                        "START",
 5113                        "END",
 5114                        "REF",
 5115                        "ALT",
 5116                        "QUAL",
 5117                        "FILTER",
 5118                        "GENOTYPE",
 5119                    ]
 5120
 5121                    # List all columns to add into header
 5122                    for header_column in output_results_tsv_columns:
 5123
 5124                        # If header column is enable
 5125                        if header_column not in fields_to_avoid:
 5126
 5127                            # Header info type
 5128                            header_info_type = "String"
 5129                            header_column_df = output_results_tsv_df[header_column]
 5130                            header_column_df_dtype = header_column_df.dtype
 5131                            if header_column_df_dtype == object:
 5132                                if (
 5133                                    pd.to_numeric(header_column_df, errors="coerce")
 5134                                    .notnull()
 5135                                    .all()
 5136                                ):
 5137                                    header_info_type = "Float"
 5138                            else:
 5139                                header_info_type = "Integer"
 5140
 5141                            # Header info
 5142                            characters_to_validate = ["-"]
 5143                            pattern = "[" + "".join(characters_to_validate) + "]"
 5144                            header_info_name = re.sub(
 5145                                pattern,
 5146                                "_",
 5147                                f"Exomiser_{header_column}".replace("#", ""),
 5148                            )
 5149                            header_info_number = "."
 5150                            header_info_description = (
 5151                                f"Exomiser {header_column} annotation"
 5152                            )
 5153                            header_info_source = "Exomiser"
 5154                            header_info_version = "unknown"
 5155                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5156                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5157                                header_info_name,
 5158                                header_info_number,
 5159                                header_info_type,
 5160                                header_info_description,
 5161                                header_info_source,
 5162                                header_info_version,
 5163                                header_info_code,
 5164                            )
 5165
 5166                            # Add field to add for update to concat fields
 5167                            sql_query_update_concat_fields.append(
 5168                                f"""
 5169                                CASE
 5170                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5171                                    THEN concat(
 5172                                        '{header_info_name}=',
 5173                                        table_parquet."{header_column}",
 5174                                        ';'
 5175                                        )
 5176
 5177                                    ELSE ''
 5178                                END
 5179                            """
 5180                            )
 5181
 5182                    # Update query
 5183                    sql_query_update = f"""
 5184                        UPDATE {table_variants} as table_variants
 5185                            SET INFO = concat(
 5186                                            CASE
 5187                                                WHEN INFO NOT IN ('', '.')
 5188                                                THEN INFO
 5189                                                ELSE ''
 5190                                            END,
 5191                                            CASE
 5192                                                WHEN table_variants.INFO NOT IN ('','.')
 5193                                                THEN ';'
 5194                                                ELSE ''
 5195                                            END,
 5196                                            (
 5197                                            SELECT 
 5198                                                concat(
 5199                                                    {",".join(sql_query_update_concat_fields)}
 5200                                                )
 5201                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5202                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5203                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5204                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5205                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5206                                            )
 5207                                        )
 5208                            ;
 5209                        """
 5210
 5211                    # Update
 5212                    self.conn.execute(sql_query_update)
 5213
 5214                ### Annotate with VCF INFO field ###
 5215
 5216                # Init result VCF file
 5217                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5218
 5219                # If VCF exists
 5220                if os.path.exists(output_results_vcf):
 5221
 5222                    # Log
 5223                    log.debug("Exomiser result VCF update variants")
 5224
 5225                    # Find Exomiser INFO field annotation in header
 5226                    with gzip.open(output_results_vcf, "rt") as f:
 5227                        header_list = self.read_vcf_header(f)
 5228                    exomiser_vcf_header = vcf.Reader(
 5229                        io.StringIO("\n".join(header_list))
 5230                    )
 5231
 5232                    # Add annotation INFO field to header
 5233                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5234
 5235                    # Update variants with VCF
 5236                    self.update_from_vcf(output_results_vcf)
 5237
 5238        return True
 5239
 5240    def annotation_snpeff(self, threads: int = None) -> None:
 5241        """
 5242        This function annotate with snpEff
 5243
 5244        :param threads: The number of threads to use
 5245        :return: the value of the variable "return_value".
 5246        """
 5247
 5248        # DEBUG
 5249        log.debug("Start annotation with snpeff databases")
 5250
 5251        # Threads
 5252        if not threads:
 5253            threads = self.get_threads()
 5254        log.debug("Threads: " + str(threads))
 5255
 5256        # DEBUG
 5257        delete_tmp = True
 5258        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5259            delete_tmp = False
 5260            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5261
 5262        # Config
 5263        config = self.get_config()
 5264        log.debug("Config: " + str(config))
 5265
 5266        # Config - Folders - Databases
 5267        databases_folders = (
 5268            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5269        )
 5270        log.debug("Databases annotations: " + str(databases_folders))
 5271
 5272        # Config - snpEff bin command
 5273        snpeff_bin_command = get_bin_command(
 5274            bin="snpEff.jar",
 5275            tool="snpeff",
 5276            bin_type="jar",
 5277            config=config,
 5278            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5279        )
 5280        if not snpeff_bin_command:
 5281            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5282            log.error(msg_err)
 5283            raise ValueError(msg_err)
 5284
 5285        # Config - snpEff databases
 5286        snpeff_databases = (
 5287            config.get("folders", {})
 5288            .get("databases", {})
 5289            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5290        )
 5291        snpeff_databases = full_path(snpeff_databases)
 5292        if snpeff_databases is not None and snpeff_databases != "":
 5293            log.debug(f"Create snpEff databases folder")
 5294            if not os.path.exists(snpeff_databases):
 5295                os.makedirs(snpeff_databases)
 5296
 5297        # Param
 5298        param = self.get_param()
 5299        log.debug("Param: " + str(param))
 5300
 5301        # Param
 5302        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5303        log.debug("Options: " + str(options))
 5304
 5305        # Param - Assembly
 5306        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5307
 5308        # Param - Options
 5309        snpeff_options = (
 5310            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5311        )
 5312        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5313        snpeff_csvstats = (
 5314            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5315        )
 5316        if snpeff_stats:
 5317            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5318            snpeff_stats = full_path(snpeff_stats)
 5319            snpeff_options += f" -stats {snpeff_stats}"
 5320        if snpeff_csvstats:
 5321            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5322            snpeff_csvstats = full_path(snpeff_csvstats)
 5323            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5324
 5325        # Data
 5326        table_variants = self.get_table_variants()
 5327
 5328        # Check if not empty
 5329        log.debug("Check if not empty")
 5330        sql_query_chromosomes = (
 5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5332        )
 5333        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5334        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5335            log.info(f"VCF empty")
 5336            return
 5337
 5338        # Export in VCF
 5339        log.debug("Create initial file to annotate")
 5340        tmp_vcf = NamedTemporaryFile(
 5341            prefix=self.get_prefix(),
 5342            dir=self.get_tmp_dir(),
 5343            suffix=".vcf.gz",
 5344            delete=True,
 5345        )
 5346        tmp_vcf_name = tmp_vcf.name
 5347
 5348        # VCF header
 5349        vcf_reader = self.get_header()
 5350        log.debug("Initial header: " + str(vcf_reader.infos))
 5351
 5352        # Existing annotations
 5353        for vcf_annotation in self.get_header().infos:
 5354
 5355            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5356            log.debug(
 5357                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5358            )
 5359
 5360        # Memory limit
 5361        # if config.get("memory", None):
 5362        #     memory_limit = config.get("memory", "8G")
 5363        # else:
 5364        #     memory_limit = "8G"
 5365        memory_limit = self.get_memory("8G")
 5366        log.debug(f"memory_limit: {memory_limit}")
 5367
 5368        # snpEff java options
 5369        snpeff_java_options = (
 5370            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5371        )
 5372        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5373
 5374        force_update_annotation = True
 5375
 5376        if "ANN" not in self.get_header().infos or force_update_annotation:
 5377
 5378            # Check snpEff database
 5379            log.debug(f"Check snpEff databases {[assembly]}")
 5380            databases_download_snpeff(
 5381                folder=snpeff_databases, assemblies=[assembly], config=config
 5382            )
 5383
 5384            # Export VCF file
 5385            self.export_variant_vcf(
 5386                vcf_file=tmp_vcf_name,
 5387                remove_info=True,
 5388                add_samples=False,
 5389                index=True,
 5390            )
 5391
 5392            # Tmp file
 5393            err_files = []
 5394            tmp_annotate_vcf = NamedTemporaryFile(
 5395                prefix=self.get_prefix(),
 5396                dir=self.get_tmp_dir(),
 5397                suffix=".vcf",
 5398                delete=False,
 5399            )
 5400            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5401            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5402            err_files.append(tmp_annotate_vcf_name_err)
 5403
 5404            # Command
 5405            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5406            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5407            run_parallel_commands([snpeff_command], 1)
 5408
 5409            # Error messages
 5410            log.info(f"Error/Warning messages:")
 5411            error_message_command_all = []
 5412            error_message_command_warning = []
 5413            error_message_command_err = []
 5414            for err_file in err_files:
 5415                with open(err_file, "r") as f:
 5416                    for line in f:
 5417                        message = line.strip()
 5418                        error_message_command_all.append(message)
 5419                        if line.startswith("[W::"):
 5420                            error_message_command_warning.append(message)
 5421                        if line.startswith("[E::"):
 5422                            error_message_command_err.append(f"{err_file}: " + message)
 5423            # log info
 5424            for message in list(
 5425                set(error_message_command_err + error_message_command_warning)
 5426            ):
 5427                log.info(f"   {message}")
 5428            # debug info
 5429            for message in list(set(error_message_command_all)):
 5430                log.debug(f"   {message}")
 5431            # failed
 5432            if len(error_message_command_err):
 5433                log.error("Annotation failed: Error in commands")
 5434                raise ValueError("Annotation failed: Error in commands")
 5435
 5436            # Find annotation in header
 5437            with open(tmp_annotate_vcf_name, "rt") as f:
 5438                header_list = self.read_vcf_header(f)
 5439            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5440
 5441            for ann in annovar_vcf_header.infos:
 5442                if ann not in self.get_header().infos:
 5443                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5444
 5445            # Update variants
 5446            log.info(f"Annotation - Updating...")
 5447            self.update_from_vcf(tmp_annotate_vcf_name)
 5448
 5449        else:
 5450            if "ANN" in self.get_header().infos:
 5451                log.debug(f"Existing snpEff annotations in VCF")
 5452            if force_update_annotation:
 5453                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5454
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the current variants as a temporary VCF, runs Annovar's
        ``table_annovar.pl`` once per configured database, post-processes each
        result through a bcftools/sed/awk pipeline (cleaning Annovar artifacts
        and renaming INFO fields), merges all annotated files back together
        with ``bcftools merge``, and updates the variants table from the
        merged VCF. Temporary files are removed at the end.

        :param threads: number of threads to use; falls back to
            ``self.get_threads()`` when falsy
        :return: None (returns early if the variants table is empty or no
            annotations are configured)
        :raises ValueError: if the annovar/bcftools binaries or the Annovar
            databases folder cannot be resolved, or if a command reports
            errors in its stderr file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, collected for error scanning and final cleanup
        tmp_files = []
        err_files = []

        # Keep tmp files when verbosity is "debug", to ease troubleshooting.
        # NOTE(review): delete_tmp is computed but the cleanup at the end of
        # this method runs unconditionally ("if True:") — confirm whether
        # debug mode is actually meant to preserve the tmp files.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl script table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for the post-processing pipeline)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder; a list configuration falls back
        # to its first entry (with a warning). Folder is created if missing.
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: annovar database -> fields to keep/rename)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder, created if missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated in place below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even when a field already exists in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used below — dead variable?
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." (a string) — the sibling snpEff
            # method passes remove_info=True; confirm the "." sentinel is
            # intentional for export_variant_vcf.
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename, consumed later by
            # `bcftools annotate --rename-annots`
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset on each iteration, so the
                # merge-step err file appended after this loop is never
                # scanned for errors — confirm intended.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes "<prefix>.<assembly>_multianno.vcf"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to request from this database (and their renamed targets)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    # Keep the field unless it already exists and updates are
                    # not forced (force_update_annotation is True above, so in
                    # practice every field is kept)
                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("INFO/<old> <new>" line for bcftools)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol (Annovar database name passed to --protocol)
                protocol = annotation

                # argument
                argument = ""

                # operation: Annovar operation code — "g" gene-based,
                # "r" region-based, "f" filter-based (default)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above, not forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar run, then move its multianno output to a
                # predictable .tmp.vcf path for the pipeline below
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation): turn
                # Annovar's escaped "\x3b" back into commas
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): awk rebuilds
                # the INFO column keeping only key=value pairs whose value is
                # not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # ("^INFO/x" means keep x while removing the rest; "ALL"/"INFO"
                # in the field list keeps everything)
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan this run's stderr file; warnings are
                # logged, errors abort the whole annotation
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the original export with every
                # per-database annotated file into one VCF
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file and register any
                # new INFO fields on the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): runs unconditionally; delete_tmp (debug mode)
            # computed above is not honored here
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5845    # Parquet
 5846    def annotation_parquet(self, threads: int = None) -> None:
 5847        """
 5848        It takes a VCF file, and annotates it with a parquet file
 5849
 5850        :param threads: number of threads to use for the annotation
 5851        :return: the value of the variable "result".
 5852        """
 5853
 5854        # DEBUG
 5855        log.debug("Start annotation with parquet databases")
 5856
 5857        # Threads
 5858        if not threads:
 5859            threads = self.get_threads()
 5860        log.debug("Threads: " + str(threads))
 5861
 5862        # DEBUG
 5863        delete_tmp = True
 5864        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5865            delete_tmp = False
 5866            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5867
 5868        # Config
 5869        databases_folders = set(
 5870            self.get_config()
 5871            .get("folders", {})
 5872            .get("databases", {})
 5873            .get("annotations", ["."])
 5874            + self.get_config()
 5875            .get("folders", {})
 5876            .get("databases", {})
 5877            .get("parquet", ["."])
 5878        )
 5879        log.debug("Databases annotations: " + str(databases_folders))
 5880
 5881        # Param
 5882        annotations = (
 5883            self.get_param()
 5884            .get("annotation", {})
 5885            .get("parquet", {})
 5886            .get("annotations", None)
 5887        )
 5888        log.debug("Annotations: " + str(annotations))
 5889
 5890        # Assembly
 5891        assembly = self.get_param().get(
 5892            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5893        )
 5894
 5895        # Force Update Annotation
 5896        force_update_annotation = (
 5897            self.get_param()
 5898            .get("annotation", {})
 5899            .get("options", {})
 5900            .get("annotations_update", False)
 5901        )
 5902        log.debug(f"force_update_annotation={force_update_annotation}")
 5903        force_append_annotation = (
 5904            self.get_param()
 5905            .get("annotation", {})
 5906            .get("options", {})
 5907            .get("annotations_append", False)
 5908        )
 5909        log.debug(f"force_append_annotation={force_append_annotation}")
 5910
 5911        # Data
 5912        table_variants = self.get_table_variants()
 5913
 5914        # Check if not empty
 5915        log.debug("Check if not empty")
 5916        sql_query_chromosomes_df = self.get_query_to_df(
 5917            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5918        )
 5919        if not sql_query_chromosomes_df["count"][0]:
 5920            log.info(f"VCF empty")
 5921            return
 5922
 5923        # VCF header
 5924        vcf_reader = self.get_header()
 5925        log.debug("Initial header: " + str(vcf_reader.infos))
 5926
 5927        # Nb Variants POS
 5928        log.debug("NB Variants Start")
 5929        nb_variants = self.conn.execute(
 5930            f"SELECT count(*) AS count FROM variants"
 5931        ).fetchdf()["count"][0]
 5932        log.debug("NB Variants Stop")
 5933
 5934        # Existing annotations
 5935        for vcf_annotation in self.get_header().infos:
 5936
 5937            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5938            log.debug(
 5939                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5940            )
 5941
 5942        # Added columns
 5943        added_columns = []
 5944
 5945        # drop indexes
 5946        log.debug(f"Drop indexes...")
 5947        self.drop_indexes()
 5948
 5949        if annotations:
 5950
 5951            if "ALL" in annotations:
 5952
 5953                all_param = annotations.get("ALL", {})
 5954                all_param_formats = all_param.get("formats", None)
 5955                all_param_releases = all_param.get("releases", None)
 5956
 5957                databases_infos_dict = self.scan_databases(
 5958                    database_formats=all_param_formats,
 5959                    database_releases=all_param_releases,
 5960                )
 5961                for database_infos in databases_infos_dict.keys():
 5962                    if database_infos not in annotations:
 5963                        annotations[database_infos] = {"INFO": None}
 5964
 5965            for annotation in annotations:
 5966
 5967                if annotation in ["ALL"]:
 5968                    continue
 5969
 5970                # Annotation Name
 5971                annotation_name = os.path.basename(annotation)
 5972
 5973                # Annotation fields
 5974                annotation_fields = annotations[annotation]
 5975                if not annotation_fields:
 5976                    annotation_fields = {"INFO": None}
 5977
 5978                log.debug(f"Annotation '{annotation_name}'")
 5979                log.debug(
 5980                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5981                )
 5982
 5983                # Create Database
 5984                database = Database(
 5985                    database=annotation,
 5986                    databases_folders=databases_folders,
 5987                    assembly=assembly,
 5988                )
 5989
 5990                # Find files
 5991                parquet_file = database.get_database()
 5992                parquet_hdr_file = database.get_header_file()
 5993                parquet_type = database.get_type()
 5994
 5995                # Check if files exists
 5996                if not parquet_file or not parquet_hdr_file:
 5997                    msg_err_list = []
 5998                    if not parquet_file:
 5999                        msg_err_list.append(
 6000                            f"Annotation failed: Annotation file not found"
 6001                        )
 6002                    if parquet_file and not parquet_hdr_file:
 6003                        msg_err_list.append(
 6004                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6005                        )
 6006
 6007                    log.error(". ".join(msg_err_list))
 6008                    raise ValueError(". ".join(msg_err_list))
 6009                else:
 6010                    # Get parquet connexion
 6011                    parquet_sql_attach = database.get_sql_database_attach(
 6012                        output="query"
 6013                    )
 6014                    if parquet_sql_attach:
 6015                        self.conn.execute(parquet_sql_attach)
 6016                    parquet_file_link = database.get_sql_database_link()
 6017                    # Log
 6018                    log.debug(
 6019                        f"Annotation '{annotation_name}' - file: "
 6020                        + str(parquet_file)
 6021                        + " and "
 6022                        + str(parquet_hdr_file)
 6023                    )
 6024
 6025                    # Database full header columns
 6026                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6027                        parquet_hdr_file
 6028                    )
 6029                    # Log
 6030                    log.debug(
 6031                        "Annotation database header columns : "
 6032                        + str(parquet_hdr_vcf_header_columns)
 6033                    )
 6034
 6035                    # Load header as VCF object
 6036                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6037                    # Log
 6038                    log.debug(
 6039                        "Annotation database header: "
 6040                        + str(parquet_hdr_vcf_header_infos)
 6041                    )
 6042
 6043                    # Get extra infos
 6044                    parquet_columns = database.get_extra_columns()
 6045                    # Log
 6046                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6047
 6048                    # Add extra columns if "ALL" in annotation_fields
 6049                    # if "ALL" in annotation_fields:
 6050                    #     allow_add_extra_column = True
 6051                    if "ALL" in annotation_fields and database.get_extra_columns():
 6052                        for extra_column in database.get_extra_columns():
 6053                            if (
 6054                                extra_column not in annotation_fields
 6055                                and extra_column.replace("INFO/", "")
 6056                                not in parquet_hdr_vcf_header_infos
 6057                            ):
 6058                                parquet_hdr_vcf_header_infos[extra_column] = (
 6059                                    vcf.parser._Info(
 6060                                        extra_column,
 6061                                        ".",
 6062                                        "String",
 6063                                        f"{extra_column} description",
 6064                                        "unknown",
 6065                                        "unknown",
 6066                                        self.code_type_map["String"],
 6067                                    )
 6068                                )
 6069
 6070                    # For all fields in database
 6071                    annotation_fields_all = False
 6072                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6073                        annotation_fields_all = True
 6074                        annotation_fields = {
 6075                            key: key for key in parquet_hdr_vcf_header_infos
 6076                        }
 6077
 6078                        log.debug(
 6079                            "Annotation database header - All annotations added: "
 6080                            + str(annotation_fields)
 6081                        )
 6082
 6083                    # Init
 6084
 6085                    # List of annotation fields to use
 6086                    sql_query_annotation_update_info_sets = []
 6087
                    # List of annotations to aggregate
 6089                    sql_query_annotation_to_agregate = []
 6090
 6091                    # Number of fields
 6092                    nb_annotation_field = 0
 6093
 6094                    # Annotation fields processed
 6095                    annotation_fields_processed = []
 6096
 6097                    # Columns mapping
 6098                    map_columns = database.map_columns(
 6099                        columns=annotation_fields, prefixes=["INFO/"]
 6100                    )
 6101
 6102                    # Query dict for fields to remove (update option)
 6103                    query_dict_remove = {}
 6104
                    # Fetch annotation fields
 6106                    for annotation_field in annotation_fields:
 6107
 6108                        # annotation_field_column
 6109                        annotation_field_column = map_columns.get(
 6110                            annotation_field, "INFO"
 6111                        )
 6112
 6113                        # field new name, if parametered
 6114                        annotation_fields_new_name = annotation_fields.get(
 6115                            annotation_field, annotation_field
 6116                        )
 6117                        if not annotation_fields_new_name:
 6118                            annotation_fields_new_name = annotation_field
 6119
 6120                        # To annotate
 6121                        # force_update_annotation = True
 6122                        # force_append_annotation = True
 6123                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6124                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6125                            force_update_annotation
 6126                            or force_append_annotation
 6127                            or (
 6128                                annotation_fields_new_name
 6129                                not in self.get_header().infos
 6130                            )
 6131                        ):
 6132
 6133                            # Add field to annotation to process list
 6134                            annotation_fields_processed.append(
 6135                                annotation_fields_new_name
 6136                            )
 6137
 6138                            # explode infos for the field
 6139                            annotation_fields_new_name_info_msg = ""
 6140                            if (
 6141                                force_update_annotation
 6142                                and annotation_fields_new_name
 6143                                in self.get_header().infos
 6144                            ):
 6145                                # Remove field from INFO
 6146                                query = f"""
 6147                                    UPDATE {table_variants} as table_variants
 6148                                    SET INFO = REGEXP_REPLACE(
 6149                                                concat(table_variants.INFO,''),
 6150                                                ';*{annotation_fields_new_name}=[^;]*',
 6151                                                ''
 6152                                                )
 6153                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6154                                """
 6155                                annotation_fields_new_name_info_msg = " [update]"
 6156                                query_dict_remove[
 6157                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6158                                ] = query
 6159
 6160                            # Sep between fields in INFO
 6161                            nb_annotation_field += 1
 6162                            if nb_annotation_field > 1:
 6163                                annotation_field_sep = ";"
 6164                            else:
 6165                                annotation_field_sep = ""
 6166
 6167                            log.info(
 6168                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6169                            )
 6170
 6171                            # Add INFO field to header
 6172                            parquet_hdr_vcf_header_infos_number = (
 6173                                parquet_hdr_vcf_header_infos[annotation_field].num
 6174                                or "."
 6175                            )
 6176                            parquet_hdr_vcf_header_infos_type = (
 6177                                parquet_hdr_vcf_header_infos[annotation_field].type
 6178                                or "String"
 6179                            )
 6180                            parquet_hdr_vcf_header_infos_description = (
 6181                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6182                                or f"{annotation_field} description"
 6183                            )
 6184                            parquet_hdr_vcf_header_infos_source = (
 6185                                parquet_hdr_vcf_header_infos[annotation_field].source
 6186                                or "unknown"
 6187                            )
 6188                            parquet_hdr_vcf_header_infos_version = (
 6189                                parquet_hdr_vcf_header_infos[annotation_field].version
 6190                                or "unknown"
 6191                            )
 6192
 6193                            vcf_reader.infos[annotation_fields_new_name] = (
 6194                                vcf.parser._Info(
 6195                                    annotation_fields_new_name,
 6196                                    parquet_hdr_vcf_header_infos_number,
 6197                                    parquet_hdr_vcf_header_infos_type,
 6198                                    parquet_hdr_vcf_header_infos_description,
 6199                                    parquet_hdr_vcf_header_infos_source,
 6200                                    parquet_hdr_vcf_header_infos_version,
 6201                                    self.code_type_map[
 6202                                        parquet_hdr_vcf_header_infos_type
 6203                                    ],
 6204                                )
 6205                            )
 6206
 6207                            # Append
 6208                            if force_append_annotation:
 6209                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6210                            else:
 6211                                query_case_when_append = ""
 6212
 6213                            # Annotation/Update query fields
 6214                            # Found in INFO column
 6215                            if (
 6216                                annotation_field_column == "INFO"
 6217                                and "INFO" in parquet_hdr_vcf_header_columns
 6218                            ):
 6219                                sql_query_annotation_update_info_sets.append(
 6220                                    f"""
 6221                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6222                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6223                                        ELSE ''
 6224                                    END
 6225                                """
 6226                                )
 6227                            # Found in a specific column
 6228                            else:
 6229                                sql_query_annotation_update_info_sets.append(
 6230                                    f"""
 6231                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6232                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6233                                        ELSE ''
 6234                                    END
 6235                                """
 6236                                )
 6237                                sql_query_annotation_to_agregate.append(
 6238                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6239                                )
 6240
 6241                        # Not to annotate
 6242                        else:
 6243
 6244                            if force_update_annotation:
 6245                                annotation_message = "forced"
 6246                            else:
 6247                                annotation_message = "skipped"
 6248
 6249                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6250                                log.warning(
 6251                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6252                                )
 6253                            if annotation_fields_new_name in self.get_header().infos:
 6254                                log.warning(
 6255                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6256                                )
 6257
 6258                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6259                    # allow_annotation_full_info = True
 6260                    allow_annotation_full_info = not force_append_annotation
 6261
 6262                    if parquet_type in ["regions"]:
 6263                        allow_annotation_full_info = False
 6264
 6265                    if (
 6266                        allow_annotation_full_info
 6267                        and nb_annotation_field == len(annotation_fields)
 6268                        and annotation_fields_all
 6269                        and (
 6270                            "INFO" in parquet_hdr_vcf_header_columns
 6271                            and "INFO" in database.get_extra_columns()
 6272                        )
 6273                    ):
 6274                        log.debug("Column INFO annotation enabled")
 6275                        sql_query_annotation_update_info_sets = []
 6276                        sql_query_annotation_update_info_sets.append(
 6277                            f" table_parquet.INFO "
 6278                        )
 6279
 6280                    if sql_query_annotation_update_info_sets:
 6281
 6282                        # Annotate
 6283                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6284
 6285                        # Join query annotation update info sets for SQL
 6286                        sql_query_annotation_update_info_sets_sql = ",".join(
 6287                            sql_query_annotation_update_info_sets
 6288                        )
 6289
 6290                        # Check chromosomes list (and variants infos)
 6291                        sql_query_chromosomes = f"""
 6292                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6293                            FROM {table_variants} as table_variants
 6294                            GROUP BY table_variants."#CHROM"
 6295                            ORDER BY table_variants."#CHROM"
 6296                            """
 6297                        sql_query_chromosomes_df = self.conn.execute(
 6298                            sql_query_chromosomes
 6299                        ).df()
 6300                        sql_query_chromosomes_dict = {
 6301                            entry["CHROM"]: {
 6302                                "count": entry["count_variants"],
 6303                                "min": entry["min_variants"],
 6304                                "max": entry["max_variants"],
 6305                            }
 6306                            for index, entry in sql_query_chromosomes_df.iterrows()
 6307                        }
 6308
 6309                        # Init
 6310                        nb_of_query = 0
 6311                        nb_of_variant_annotated = 0
 6312                        query_dict = query_dict_remove
 6313
 6314                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6315                        for chrom in sql_query_chromosomes_dict:
 6316
 6317                            # Number of variant by chromosome
 6318                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6319                                chrom, {}
 6320                            ).get("count", 0)
 6321
 6322                            log.debug(
 6323                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6324                            )
 6325
 6326                            # Annotation with regions database
 6327                            if parquet_type in ["regions"]:
 6328                                sql_query_annotation_from_clause = f"""
 6329                                    FROM (
 6330                                        SELECT 
 6331                                            '{chrom}' AS \"#CHROM\",
 6332                                            table_variants_from.\"POS\" AS \"POS\",
 6333                                            {",".join(sql_query_annotation_to_agregate)}
 6334                                        FROM {table_variants} as table_variants_from
 6335                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6336                                            table_parquet_from."#CHROM" = '{chrom}'
 6337                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6338                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6339                                        )
 6340                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6341                                        GROUP BY table_variants_from.\"POS\"
 6342                                        )
 6343                                        as table_parquet
 6344                                """
 6345
 6346                                sql_query_annotation_where_clause = """
 6347                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6348                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6349                                """
 6350
 6351                            # Annotation with variants database
 6352                            else:
 6353                                sql_query_annotation_from_clause = f"""
 6354                                    FROM {parquet_file_link} as table_parquet
 6355                                """
 6356                                sql_query_annotation_where_clause = f"""
 6357                                    table_variants."#CHROM" = '{chrom}'
 6358                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6359                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6360                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6361                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6362                                """
 6363
 6364                            # Create update query
 6365                            sql_query_annotation_chrom_interval_pos = f"""
 6366                                UPDATE {table_variants} as table_variants
 6367                                    SET INFO = 
 6368                                        concat(
 6369                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6370                                                THEN table_variants.INFO
 6371                                                ELSE ''
 6372                                            END
 6373                                            ,
 6374                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6375                                                        AND (
 6376                                                        concat({sql_query_annotation_update_info_sets_sql})
 6377                                                        )
 6378                                                        NOT IN ('','.') 
 6379                                                    THEN ';'
 6380                                                    ELSE ''
 6381                                            END
 6382                                            ,
 6383                                            {sql_query_annotation_update_info_sets_sql}
 6384                                            )
 6385                                    {sql_query_annotation_from_clause}
 6386                                    WHERE {sql_query_annotation_where_clause}
 6387                                    ;
 6388                                """
 6389
 6390                            # Add update query to dict
 6391                            query_dict[
 6392                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6393                            ] = sql_query_annotation_chrom_interval_pos
 6394
 6395                        nb_of_query = len(query_dict)
 6396                        num_query = 0
 6397
 6398                        # SET max_expression_depth TO x
 6399                        self.conn.execute("SET max_expression_depth TO 10000")
 6400
 6401                        for query_name in query_dict:
 6402                            query = query_dict[query_name]
 6403                            num_query += 1
 6404                            log.info(
 6405                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6406                            )
 6407                            result = self.conn.execute(query)
 6408                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6409                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6410                            log.info(
 6411                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6412                            )
 6413
 6414                        log.info(
 6415                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6416                        )
 6417
 6418                    else:
 6419
 6420                        log.info(
 6421                            f"Annotation '{annotation_name}' - No Annotations available"
 6422                        )
 6423
 6424                    log.debug("Final header: " + str(vcf_reader.infos))
 6425
 6426        # Remove added columns
 6427        for added_column in added_columns:
 6428            self.drop_column(column=added_column)
 6429
 6430    def annotation_splice(self, threads: int = None) -> None:
 6431        """
        This function annotates variants with splice prediction tools (SpliceAI, SPiP)
 6433
 6434        :param threads: The number of threads to use
 6435        :return: the value of the variable "return_value".
 6436        """
 6437
 6438        # DEBUG
 6439        log.debug("Start annotation with splice tools")
 6440
 6441        # Threads
 6442        if not threads:
 6443            threads = self.get_threads()
 6444        log.debug("Threads: " + str(threads))
 6445
 6446        # DEBUG
 6447        delete_tmp = True
 6448        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6449            delete_tmp = False
 6450            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6451
 6452        # Config
 6453        config = self.get_config()
 6454        log.debug("Config: " + str(config))
 6455        splice_config = config.get("tools", {}).get("splice", {})
 6456        if not splice_config:
 6457            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6458            msg_err = "No Splice tool config"
 6459            raise ValueError(msg_err)
 6460        log.debug(f"splice_config: {splice_config}")
 6461
 6462        # Config - Folders - Databases
 6463        databases_folders = (
 6464            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6465        )
 6466        log.debug("Databases annotations: " + str(databases_folders))
 6467
 6468        # Splice docker image
 6469        splice_docker_image = splice_config.get("docker").get("image")
 6470
 6471        # Pull splice image if it's not already there
 6472        if not check_docker_image_exists(splice_docker_image):
 6473            log.warning(
 6474                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6475            )
 6476            try:
 6477                command(f"docker pull {splice_config.get('docker').get('image')}")
 6478            except subprocess.CalledProcessError:
 6479                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6480                log.error(msg_err)
 6481                raise ValueError(msg_err)
 6482
 6483        # Config - splice databases
 6484        splice_databases = (
 6485            config.get("folders", {})
 6486            .get("databases", {})
 6487            .get("splice", DEFAULT_SPLICE_FOLDER)
 6488        )
 6489        splice_databases = full_path(splice_databases)
 6490
 6491        # Param
 6492        param = self.get_param()
 6493        log.debug("Param: " + str(param))
 6494
 6495        # Param
 6496        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6497        log.debug("Options: " + str(options))
 6498
 6499        # Data
 6500        table_variants = self.get_table_variants()
 6501
 6502        # Check if not empty
 6503        log.debug("Check if not empty")
 6504        sql_query_chromosomes = (
 6505            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6506        )
 6507        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6508            log.info("VCF empty")
 6509            return None
 6510
 6511        # Export in VCF
 6512        log.debug("Create initial file to annotate")
 6513
 6514        # Create output folder / work folder
 6515        if options.get("output_folder", ""):
 6516            output_folder = options.get("output_folder", "")
 6517            if not os.path.exists(output_folder):
 6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6519        else:
 6520            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6521            if not os.path.exists(output_folder):
 6522                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6523
 6524        if options.get("workdir", ""):
 6525            workdir = options.get("workdir", "")
 6526        else:
 6527            workdir = "/work"
 6528
 6529        # Create tmp VCF file
 6530        tmp_vcf = NamedTemporaryFile(
 6531            prefix=self.get_prefix(),
 6532            dir=output_folder,
 6533            suffix=".vcf",
 6534            delete=False,
 6535        )
 6536        tmp_vcf_name = tmp_vcf.name
 6537
 6538        # VCF header
 6539        header = self.get_header()
 6540
 6541        # Existing annotations
 6542        for vcf_annotation in self.get_header().infos:
 6543
 6544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6545            log.debug(
 6546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6547            )
 6548
 6549        # Memory limit
 6550        if config.get("memory", None):
 6551            memory_limit = config.get("memory", "8G").upper()
 6552            # upper()
 6553        else:
 6554            memory_limit = "8G"
 6555        log.debug(f"memory_limit: {memory_limit}")
 6556
 6557        # Check number of variants to annotate
 6558        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6559        where_clause_regex_spip = r"SPiP_\w+"
 6560        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6561        df_list_of_variants_to_annotate = self.get_query_to_df(
 6562            query=f""" SELECT * FROM variants {where_clause} """
 6563        )
 6564        if len(df_list_of_variants_to_annotate) == 0:
 6565            log.warning(
 6566                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6567            )
 6568            return None
 6569        else:
 6570            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6571
 6572        # Export VCF file
 6573        self.export_variant_vcf(
 6574            vcf_file=tmp_vcf_name,
 6575            remove_info=True,
 6576            add_samples=True,
 6577            index=False,
 6578            where_clause=where_clause,
 6579        )
 6580        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6581        if any(value for value in splice_config.values() if value is None):
 6582            log.warning("At least one splice config parameter is empty")
 6583            # exit annotation_splice
 6584            return None
 6585
 6586        # Params in splice nf
 6587        def check_values(dico: dict):
 6588            """
 6589            Ensure parameters for NF splice pipeline
 6590            """
 6591            for key, val in dico.items():
 6592                if key == "genome":
 6593                    if any(
 6594                        assemb in options.get("genome", {})
 6595                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6596                    ):
 6597                        yield f"--{key} hg19"
 6598                    elif any(
 6599                        assemb in options.get("genome", {})
 6600                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6601                    ):
 6602                        yield f"--{key} hg38"
 6603                elif (
 6604                    (isinstance(val, str) and val)
 6605                    or isinstance(val, int)
 6606                    or isinstance(val, bool)
 6607                ):
 6608                    yield f"--{key} {val}"
 6609
 6610        # Genome
 6611        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6612        options["genome"] = genome
 6613        # NF params
 6614        nf_params = []
 6615        # Add options
 6616        if options:
 6617            log.debug(options)
 6618            nf_params = list(check_values(options))
 6619            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6620        else:
 6621            log.debug("No NF params provided")
 6622        # Add threads
 6623        if "threads" not in options.keys():
 6624            nf_params.append(f"--threads {threads}")
 6625        # Genome path
 6626        genome_path = find_genome(
 6627            config.get("folders", {})
 6628            .get("databases", {})
 6629            .get("genomes", DEFAULT_GENOME_FOLDER),
 6630            file=f"{genome}.fa",
 6631        )
 6632        # Add genome path
 6633        if not genome_path:
 6634            raise ValueError(
 6635                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6636            )
 6637        else:
 6638            log.debug(f"Genome: {genome_path}")
 6639            nf_params.append(f"--genome_path {genome_path}")
 6640
 6641        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6642            """
 6643            Setting up updated databases for SPiP and SpliceAI
 6644            """
 6645
 6646            try:
 6647
 6648                # SpliceAI assembly transcriptome
 6649                spliceai_assembly = os.path.join(
 6650                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6651                    options.get("genome"),
 6652                    "transcriptome",
 6653                )
 6654                spip_assembly = options.get("genome")
 6655
 6656                spip = find(
 6657                    f"transcriptome_{spip_assembly}.RData",
 6658                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6659                )
 6660                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6661                log.debug(f"SPiP annotations: {spip}")
 6662                log.debug(f"SpliceAI annotations: {spliceai}")
 6663                if spip and spliceai:
 6664                    return [
 6665                        f"--spip_transcriptome {spip}",
 6666                        f"--spliceai_transcriptome {spliceai}",
 6667                    ]
 6668                else:
 6669                    log.warning(
 6670                        "Can't find splice databases in configuration, use annotations file from image"
 6671                    )
 6672            except TypeError:
 6673                log.warning(
 6674                    "Can't find splice databases in configuration, use annotations file from image"
 6675                )
 6676                return []
 6677
 6678        # Add options, check if transcriptome option have already beend provided
 6679        if (
 6680            "spip_transcriptome" not in nf_params
 6681            and "spliceai_transcriptome" not in nf_params
 6682        ):
 6683            splice_reference = splice_annotations(options, config)
 6684            if splice_reference:
 6685                nf_params.extend(splice_reference)
 6686        # nf_params.append(f"--output_folder {output_folder}")
 6687        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6688        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6689        log.debug(cmd)
 6690        splice_config["docker"]["command"] = cmd
 6691
 6692        # Ensure proxy is set
 6693        proxy = [
 6694            f"-e {var}={os.getenv(var)}"
 6695            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6696            if os.getenv(var) is not None
 6697        ]
 6698        docker_cmd = get_bin_command(
 6699            tool="splice",
 6700            bin_type="docker",
 6701            config=config,
 6702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6703            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6704        )
 6705        # print(docker_cmd)
 6706        # exit()
 6707        # Docker debug
 6708        # if splice_config.get("rm_container"):
 6709        #     rm_container = "--rm"
 6710        # else:
 6711        #     rm_container = ""
 6712        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6713        log.debug(docker_cmd)
 6714        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6715        log.debug(res.stdout)
 6716        if res.stderr:
 6717            log.error(res.stderr)
 6718        res.check_returncode()
 6719        # Update variants
 6720        log.info("Annotation - Updating...")
 6721        # Test find output vcf
 6722        log.debug(
 6723            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6724        )
 6725        output_vcf = []
 6726        # Wrong folder to look in
 6727        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6728            if (
 6729                files
 6730                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6731            ):
 6732                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6733        # log.debug(os.listdir(options.get("output_folder")))
 6734        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6735        if not output_vcf:
 6736            log.debug(
 6737                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6738            )
 6739        else:
 6740            # Get new header from annotated vcf
 6741            log.debug(f"Initial header: {len(header.infos)} fields")
 6742            # Create new header with splice infos
 6743            new_vcf = Variants(input=output_vcf[0])
 6744            new_vcf_header = new_vcf.get_header().infos
 6745            for keys, infos in new_vcf_header.items():
 6746                if keys not in header.infos.keys():
 6747                    header.infos[keys] = infos
 6748            log.debug(f"New header: {len(header.infos)} fields")
 6749            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6750            self.update_from_vcf(output_vcf[0])
 6751
 6752        # Remove file
 6753        remove_if_exists(output_vcf)
 6754
 6755    ###
 6756    # Prioritization
 6757    ###
 6758
 6759    def get_config_default(self, name: str) -> dict:
 6760        """
 6761        The function `get_config_default` returns a dictionary containing default configurations for
 6762        various calculations and prioritizations.
 6763
 6764        :param name: The `get_config_default` function returns a dictionary containing default
 6765        configurations for different calculations and prioritizations. The `name` parameter is used to
 6766        specify which specific configuration to retrieve from the dictionary
 6767        :type name: str
 6768        :return: The function `get_config_default` returns a dictionary containing default configuration
 6769        settings for different calculations and prioritizations. The specific configuration settings are
 6770        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6771        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6772        returned. If there is no match, an empty dictionary is returned.
 6773        """
 6774
 6775        config_default = {
 6776            "calculations": {
 6777                "variant_chr_pos_alt_ref": {
 6778                    "type": "sql",
 6779                    "name": "variant_chr_pos_alt_ref",
 6780                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6781                    "available": False,
 6782                    "output_column_name": "variant_chr_pos_alt_ref",
 6783                    "output_column_type": "String",
 6784                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6785                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6786                    "operation_info": True,
 6787                },
 6788                "VARTYPE": {
 6789                    "type": "sql",
 6790                    "name": "VARTYPE",
 6791                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6792                    "available": True,
 6793                    "table": "variants",
 6794                    "output_column_name": "VARTYPE",
 6795                    "output_column_type": "String",
 6796                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6797                    "operation_query": """
 6798                            CASE
 6799                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6800                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6801                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6802                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6803                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6804                                ELSE 'UNDEFINED'
 6805                            END
 6806                            """,
 6807                    "info_fields": ["SVTYPE"],
 6808                    "operation_info": True,
 6809                },
 6810                "snpeff_hgvs": {
 6811                    "type": "python",
 6812                    "name": "snpeff_hgvs",
 6813                    "description": "HGVS nomenclatures from snpEff annotation",
 6814                    "available": True,
 6815                    "function_name": "calculation_extract_snpeff_hgvs",
 6816                    "function_params": ["snpeff_hgvs", "ANN"],
 6817                },
 6818                "snpeff_ann_explode": {
 6819                    "type": "python",
 6820                    "name": "snpeff_ann_explode",
 6821                    "description": "Explode snpEff annotations with uniquify values",
 6822                    "available": True,
 6823                    "function_name": "calculation_snpeff_ann_explode",
 6824                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6825                },
 6826                "snpeff_ann_explode_uniquify": {
 6827                    "type": "python",
 6828                    "name": "snpeff_ann_explode_uniquify",
 6829                    "description": "Explode snpEff annotations",
 6830                    "available": True,
 6831                    "function_name": "calculation_snpeff_ann_explode",
 6832                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6833                },
 6834                "snpeff_ann_explode_json": {
 6835                    "type": "python",
 6836                    "name": "snpeff_ann_explode_json",
 6837                    "description": "Explode snpEff annotations in JSON format",
 6838                    "available": True,
 6839                    "function_name": "calculation_snpeff_ann_explode",
 6840                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6841                },
 6842                "NOMEN": {
 6843                    "type": "python",
 6844                    "name": "NOMEN",
 6845                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6846                    "available": True,
 6847                    "function_name": "calculation_extract_nomen",
 6848                    "function_params": [],
 6849                },
 6850                "RENAME_INFO_FIELDS": {
 6851                    "type": "python",
 6852                    "name": "RENAME_INFO_FIELDS",
 6853                    "description": "Rename or remove INFO/tags",
 6854                    "available": True,
 6855                    "function_name": "calculation_rename_info_fields",
 6856                    "function_params": [],
 6857                },
 6858                "FINDBYPIPELINE": {
 6859                    "type": "python",
 6860                    "name": "FINDBYPIPELINE",
 6861                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6862                    "available": True,
 6863                    "function_name": "calculation_find_by_pipeline",
 6864                    "function_params": ["findbypipeline"],
 6865                },
 6866                "FINDBYSAMPLE": {
 6867                    "type": "python",
 6868                    "name": "FINDBYSAMPLE",
 6869                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6870                    "available": True,
 6871                    "function_name": "calculation_find_by_pipeline",
 6872                    "function_params": ["findbysample"],
 6873                },
 6874                "GENOTYPECONCORDANCE": {
 6875                    "type": "python",
 6876                    "name": "GENOTYPECONCORDANCE",
 6877                    "description": "Concordance of genotype for multi caller VCF",
 6878                    "available": True,
 6879                    "function_name": "calculation_genotype_concordance",
 6880                    "function_params": [],
 6881                },
 6882                "BARCODE": {
 6883                    "type": "python",
 6884                    "name": "BARCODE",
 6885                    "description": "BARCODE as VaRank tool",
 6886                    "available": True,
 6887                    "function_name": "calculation_barcode",
 6888                    "function_params": [],
 6889                },
 6890                "BARCODEFAMILY": {
 6891                    "type": "python",
 6892                    "name": "BARCODEFAMILY",
 6893                    "description": "BARCODEFAMILY as VaRank tool",
 6894                    "available": True,
 6895                    "function_name": "calculation_barcode_family",
 6896                    "function_params": ["BCF"],
 6897                },
 6898                "TRIO": {
 6899                    "type": "python",
 6900                    "name": "TRIO",
 6901                    "description": "Inheritance for a trio family",
 6902                    "available": True,
 6903                    "function_name": "calculation_trio",
 6904                    "function_params": [],
 6905                },
 6906                "VAF": {
 6907                    "type": "python",
 6908                    "name": "VAF",
 6909                    "description": "Variant Allele Frequency (VAF) harmonization",
 6910                    "available": True,
 6911                    "function_name": "calculation_vaf_normalization",
 6912                    "function_params": [],
 6913                },
 6914                "VAF_stats": {
 6915                    "type": "python",
 6916                    "name": "VAF_stats",
 6917                    "description": "Variant Allele Frequency (VAF) statistics",
 6918                    "available": True,
 6919                    "function_name": "calculation_genotype_stats",
 6920                    "function_params": ["VAF"],
 6921                },
 6922                "DP_stats": {
 6923                    "type": "python",
 6924                    "name": "DP_stats",
 6925                    "description": "Depth (DP) statistics",
 6926                    "available": True,
 6927                    "function_name": "calculation_genotype_stats",
 6928                    "function_params": ["DP"],
 6929                },
 6930                "variant_id": {
 6931                    "type": "python",
 6932                    "name": "variant_id",
 6933                    "description": "Variant ID generated from variant position and type",
 6934                    "available": True,
 6935                    "function_name": "calculation_variant_id",
 6936                    "function_params": [],
 6937                },
 6938                "transcripts_json": {
 6939                    "type": "python",
 6940                    "name": "transcripts_json",
 6941                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6942                    "available": True,
 6943                    "function_name": "calculation_transcripts_annotation",
 6944                    "function_params": ["transcripts_json", None],
 6945                },
 6946                "transcripts_ann": {
 6947                    "type": "python",
 6948                    "name": "transcripts_ann",
 6949                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6950                    "available": True,
 6951                    "function_name": "calculation_transcripts_annotation",
 6952                    "function_params": [None, "transcripts_ann"],
 6953                },
 6954                "transcripts_annotations": {
 6955                    "type": "python",
 6956                    "name": "transcripts_annotations",
 6957                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6958                    "available": True,
 6959                    "function_name": "calculation_transcripts_annotation",
 6960                    "function_params": [None, None],
 6961                },
 6962                "transcripts_prioritization": {
 6963                    "type": "python",
 6964                    "name": "transcripts_prioritization",
 6965                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6966                    "available": True,
 6967                    "function_name": "calculation_transcripts_prioritization",
 6968                    "function_params": [],
 6969                },
 6970                "transcripts_export": {
 6971                    "type": "python",
 6972                    "name": "transcripts_export",
 6973                    "description": "Export transcripts table/view as a file (using param.json)",
 6974                    "available": True,
 6975                    "function_name": "calculation_transcripts_export",
 6976                    "function_params": [],
 6977                },
 6978            },
 6979            "prioritizations": {
 6980                "default": {
 6981                    "ANN2": [
 6982                        {
 6983                            "type": "contains",
 6984                            "value": "HIGH",
 6985                            "score": 5,
 6986                            "flag": "PASS",
 6987                            "comment": [
 6988                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6989                            ],
 6990                        },
 6991                        {
 6992                            "type": "contains",
 6993                            "value": "MODERATE",
 6994                            "score": 3,
 6995                            "flag": "PASS",
 6996                            "comment": [
 6997                                "A non-disruptive variant that might change protein effectiveness"
 6998                            ],
 6999                        },
 7000                        {
 7001                            "type": "contains",
 7002                            "value": "LOW",
 7003                            "score": 0,
 7004                            "flag": "FILTERED",
 7005                            "comment": [
 7006                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7007                            ],
 7008                        },
 7009                        {
 7010                            "type": "contains",
 7011                            "value": "MODIFIER",
 7012                            "score": 0,
 7013                            "flag": "FILTERED",
 7014                            "comment": [
 7015                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7016                            ],
 7017                        },
 7018                    ],
 7019                }
 7020            },
 7021        }
 7022
 7023        return config_default.get(name, None)
 7024
 7025    def get_config_json(
 7026        self, name: str, config_dict: dict = {}, config_file: str = None
 7027    ) -> dict:
 7028        """
 7029        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7030        default values, a dictionary, and a file.
 7031
 7032        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7033        the name of the configuration. It is used to identify and retrieve the configuration settings
 7034        for a specific component or module
 7035        :type name: str
 7036        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7037        dictionary that allows you to provide additional configuration settings or overrides. When you
 7038        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7039        the key is the configuration setting you want to override or
 7040        :type config_dict: dict
 7041        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7042        specify the path to a configuration file that contains additional settings. If provided, the
 7043        function will read the contents of this file and update the configuration dictionary with the
 7044        values found in the file, overriding any existing values with the
 7045        :type config_file: str
 7046        :return: The function `get_config_json` returns a dictionary containing the configuration
 7047        settings.
 7048        """
 7049
 7050        # Create with default prioritizations
 7051        config_default = self.get_config_default(name=name)
 7052        configuration = config_default
 7053        # log.debug(f"configuration={configuration}")
 7054
 7055        # Replace prioritizations from dict
 7056        for config in config_dict:
 7057            configuration[config] = config_dict[config]
 7058
 7059        # Replace prioritizations from file
 7060        config_file = full_path(config_file)
 7061        if config_file:
 7062            if os.path.exists(config_file):
 7063                with open(config_file) as config_file_content:
 7064                    config_file_dict = yaml.safe_load(config_file_content)
 7065                for config in config_file_dict:
 7066                    configuration[config] = config_file_dict[config]
 7067            else:
 7068                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7069                log.error(msg_error)
 7070                raise ValueError(msg_error)
 7071
 7072        return configuration
 7073
 7074    def prioritization(
 7075        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7076    ) -> bool:
 7077        """
 7078        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7079        prioritizes variants based on configured profiles and criteria.
 7080
 7081        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7082        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7083        a table name is provided, the method will prioritize the variants in that specific table
 7084        :type table: str
 7085        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7086        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7087        provided, the code will use a default prefix value of "PZ"
 7088        :type pz_prefix: str
 7089        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7090        additional parameters specific to the prioritization process. These parameters can include
 7091        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7092        configurations needed for the prioritization of variants in a V
 7093        :type pz_param: dict
 7094        :return: A boolean value (True) is being returned from the `prioritization` function.
 7095        """
 7096
 7097        # Config
 7098        config = self.get_config()
 7099
 7100        # Param
 7101        param = self.get_param()
 7102
 7103        # Prioritization param
 7104        if pz_param is not None:
 7105            prioritization_param = pz_param
 7106        else:
 7107            prioritization_param = param.get("prioritization", {})
 7108
 7109        # Configuration profiles
 7110        prioritization_config_file = prioritization_param.get(
 7111            "prioritization_config", None
 7112        )
 7113        prioritization_config_file = full_path(prioritization_config_file)
 7114        prioritizations_config = self.get_config_json(
 7115            name="prioritizations", config_file=prioritization_config_file
 7116        )
 7117
 7118        # Prioritization prefix
 7119        pz_prefix_default = "PZ"
 7120        if pz_prefix is None:
 7121            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7122
 7123        # Prioritization options
 7124        profiles = prioritization_param.get("profiles", [])
 7125        if isinstance(profiles, str):
 7126            profiles = profiles.split(",")
 7127        pzfields = prioritization_param.get(
 7128            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7129        )
 7130        if isinstance(pzfields, str):
 7131            pzfields = pzfields.split(",")
 7132        default_profile = prioritization_param.get("default_profile", None)
 7133        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7134        prioritization_score_mode = prioritization_param.get(
 7135            "prioritization_score_mode", "HOWARD"
 7136        )
 7137
 7138        # Quick Prioritizations
 7139        prioritizations = param.get("prioritizations", None)
 7140        if prioritizations:
 7141            log.info("Quick Prioritization:")
 7142            for profile in prioritizations.split(","):
 7143                if profile not in profiles:
 7144                    profiles.append(profile)
 7145                    log.info(f"   {profile}")
 7146
 7147        # If profile "ALL" provided, all profiles in the config profiles
 7148        if "ALL" in profiles:
 7149            profiles = list(prioritizations_config.keys())
 7150
 7151        for profile in profiles:
 7152            if prioritizations_config.get(profile, None):
 7153                log.debug(f"Profile '{profile}' configured")
 7154            else:
 7155                msg_error = f"Profile '{profile}' NOT configured"
 7156                log.error(msg_error)
 7157                raise ValueError(msg_error)
 7158
 7159        if profiles:
 7160            log.info(f"Prioritization... ")
 7161        else:
 7162            log.debug(f"No profile defined")
 7163            return False
 7164
 7165        if not default_profile and len(profiles):
 7166            default_profile = profiles[0]
 7167
 7168        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7169        log.debug("Profiles to check: " + str(list(profiles)))
 7170
 7171        # Variables
 7172        if table is not None:
 7173            table_variants = table
 7174        else:
 7175            table_variants = self.get_table_variants(clause="update")
 7176        log.debug(f"Table to prioritize: {table_variants}")
 7177
 7178        # Added columns
 7179        added_columns = []
 7180
 7181        # Create list of PZfields
 7182        # List of PZFields
 7183        list_of_pzfields_original = pzfields + [
 7184            pzfield + pzfields_sep + profile
 7185            for pzfield in pzfields
 7186            for profile in profiles
 7187        ]
 7188        list_of_pzfields = []
 7189        log.debug(f"{list_of_pzfields_original}")
 7190
 7191        # Remove existing PZfields to use if exists
 7192        for pzfield in list_of_pzfields_original:
 7193            if self.get_header().infos.get(pzfield, None) is None:
 7194                list_of_pzfields.append(pzfield)
 7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7196            else:
 7197                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7198
 7199        if list_of_pzfields:
 7200
 7201            # Explode Infos prefix
 7202            explode_infos_prefix = self.get_explode_infos_prefix()
 7203
 7204            # PZfields tags description
 7205            PZfields_INFOS = {
 7206                f"{pz_prefix}Tags": {
 7207                    "ID": f"{pz_prefix}Tags",
 7208                    "Number": ".",
 7209                    "Type": "String",
 7210                    "Description": "Variant tags based on annotation criteria",
 7211                },
 7212                f"{pz_prefix}Score": {
 7213                    "ID": f"{pz_prefix}Score",
 7214                    "Number": 1,
 7215                    "Type": "Integer",
 7216                    "Description": "Variant score based on annotation criteria",
 7217                },
 7218                f"{pz_prefix}Flag": {
 7219                    "ID": f"{pz_prefix}Flag",
 7220                    "Number": 1,
 7221                    "Type": "String",
 7222                    "Description": "Variant flag based on annotation criteria",
 7223                },
 7224                f"{pz_prefix}Comment": {
 7225                    "ID": f"{pz_prefix}Comment",
 7226                    "Number": ".",
 7227                    "Type": "String",
 7228                    "Description": "Variant comment based on annotation criteria",
 7229                },
 7230                f"{pz_prefix}Infos": {
 7231                    "ID": f"{pz_prefix}Infos",
 7232                    "Number": ".",
 7233                    "Type": "String",
 7234                    "Description": "Variant infos based on annotation criteria",
 7235                },
 7236                f"{pz_prefix}Class": {
 7237                    "ID": f"{pz_prefix}Class",
 7238                    "Number": ".",
 7239                    "Type": "String",
 7240                    "Description": "Variant class based on annotation criteria",
 7241                },
 7242            }
 7243
 7244            # Create INFO fields if not exist
 7245            for field in PZfields_INFOS:
 7246                field_ID = PZfields_INFOS[field]["ID"]
 7247                field_description = PZfields_INFOS[field]["Description"]
 7248                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7249                    field_description = (
 7250                        PZfields_INFOS[field]["Description"]
 7251                        + f", profile {default_profile}"
 7252                    )
 7253                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7254                        field_ID,
 7255                        PZfields_INFOS[field]["Number"],
 7256                        PZfields_INFOS[field]["Type"],
 7257                        field_description,
 7258                        "unknown",
 7259                        "unknown",
 7260                        code_type_map[PZfields_INFOS[field]["Type"]],
 7261                    )
 7262
 7263            # Create INFO fields if not exist for each profile
 7264            for profile in prioritizations_config:
 7265                if profile in profiles or profiles == []:
 7266                    for field in PZfields_INFOS:
 7267                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7268                        field_description = (
 7269                            PZfields_INFOS[field]["Description"]
 7270                            + f", profile {profile}"
 7271                        )
 7272                        if (
 7273                            field_ID not in self.get_header().infos
 7274                            and field in pzfields
 7275                        ):
 7276                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7277                                field_ID,
 7278                                PZfields_INFOS[field]["Number"],
 7279                                PZfields_INFOS[field]["Type"],
 7280                                field_description,
 7281                                "unknown",
 7282                                "unknown",
 7283                                code_type_map[PZfields_INFOS[field]["Type"]],
 7284                            )
 7285
 7286            # Header
 7287            for pzfield in list_of_pzfields:
 7288                if re.match(f"{pz_prefix}Score.*", pzfield):
 7289                    added_column = self.add_column(
 7290                        table_name=table_variants,
 7291                        column_name=pzfield,
 7292                        column_type="INTEGER",
 7293                        default_value="0",
 7294                    )
 7295                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7296                    added_column = self.add_column(
 7297                        table_name=table_variants,
 7298                        column_name=pzfield,
 7299                        column_type="BOOLEAN",
 7300                        default_value="1",
 7301                    )
 7302                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7303                    added_column = self.add_column(
 7304                        table_name=table_variants,
 7305                        column_name=pzfield,
 7306                        column_type="VARCHAR[]",
 7307                        default_value="null",
 7308                    )
 7309                else:
 7310                    added_column = self.add_column(
 7311                        table_name=table_variants,
 7312                        column_name=pzfield,
 7313                        column_type="STRING",
 7314                        default_value="''",
 7315                    )
 7316                added_columns.append(added_column)
 7317
 7318            # Profiles
 7319            if profiles:
 7320
 7321                # foreach profile in configuration file
 7322                for profile in prioritizations_config:
 7323
 7324                    # If profile is asked in param, or ALL are asked (empty profile [])
 7325                    if profile in profiles or profiles == []:
 7326                        log.info(f"Profile '{profile}'")
 7327
 7328                        sql_set_info_option = ""
 7329
 7330                        sql_set_info = []
 7331
 7332                        # PZ fields set
 7333
 7334                        # PZScore
 7335                        if (
 7336                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7337                            in list_of_pzfields
 7338                        ):
 7339                            sql_set_info.append(
 7340                                f"""
 7341                                    concat(
 7342                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7343                                        {pz_prefix}Score{pzfields_sep}{profile}
 7344                                    ) 
 7345                                """
 7346                            )
 7347                            if (
 7348                                profile == default_profile
 7349                                and f"{pz_prefix}Score" in list_of_pzfields
 7350                            ):
 7351                                sql_set_info.append(
 7352                                    f"""
 7353                                        concat(
 7354                                            '{pz_prefix}Score=',
 7355                                            {pz_prefix}Score{pzfields_sep}{profile}
 7356                                        )
 7357                                    """
 7358                                )
 7359
 7360                        # PZFlag
 7361                        if (
 7362                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7363                            in list_of_pzfields
 7364                        ):
 7365                            sql_set_info.append(
 7366                                f"""
 7367                                    concat(
 7368                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7369                                        CASE 
 7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7371                                            THEN 'PASS'
 7372                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7373                                            THEN 'FILTERED'
 7374                                        END
 7375                                    ) 
 7376                                """
 7377                            )
 7378                            if (
 7379                                profile == default_profile
 7380                                and f"{pz_prefix}Flag" in list_of_pzfields
 7381                            ):
 7382                                sql_set_info.append(
 7383                                    f"""
 7384                                        concat(
 7385                                            '{pz_prefix}Flag=',
 7386                                            CASE 
 7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7388                                                THEN 'PASS'
 7389                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7390                                                THEN 'FILTERED'
 7391                                            END
 7392                                        )
 7393                                    """
 7394                                )
 7395
 7396                        # PZClass
 7397                        if (
 7398                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7399                            in list_of_pzfields
 7400                        ):
 7401                            sql_set_info.append(
 7402                                f"""
 7403                                    concat(
 7404                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7405                                        CASE
 7406                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7407                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7408                                            ELSE '.'
 7409                                        END 
 7410                                    )
 7411                                    
 7412                                """
 7413                            )
 7414                            if (
 7415                                profile == default_profile
 7416                                and f"{pz_prefix}Class" in list_of_pzfields
 7417                            ):
 7418                                sql_set_info.append(
 7419                                    f"""
 7420                                        concat(
 7421                                            '{pz_prefix}Class=',
 7422                                            CASE
 7423                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7424                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7425                                                ELSE '.'
 7426                                            END 
 7427                                        )
 7428                                    """
 7429                                )
 7430
 7431                        # PZComment
 7432                        if (
 7433                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7434                            in list_of_pzfields
 7435                        ):
 7436                            sql_set_info.append(
 7437                                f"""
 7438                                    CASE
 7439                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7440                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7441                                        ELSE ''
 7442                                    END
 7443                                """
 7444                            )
 7445                            if (
 7446                                profile == default_profile
 7447                                and f"{pz_prefix}Comment" in list_of_pzfields
 7448                            ):
 7449                                sql_set_info.append(
 7450                                    f"""
 7451                                        CASE
 7452                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7453                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7454                                            ELSE ''
 7455                                        END
 7456                                    """
 7457                                )
 7458
 7459                        # PZInfos
 7460                        if (
 7461                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7462                            in list_of_pzfields
 7463                        ):
 7464                            sql_set_info.append(
 7465                                f"""
 7466                                    CASE
 7467                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7468                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7469                                        ELSE ''
 7470                                    END
 7471                                """
 7472                            )
 7473                            if (
 7474                                profile == default_profile
 7475                                and f"{pz_prefix}Infos" in list_of_pzfields
 7476                            ):
 7477                                sql_set_info.append(
 7478                                    f"""
 7479                                        CASE
 7480                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7481                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7482                                            ELSE ''
 7483                                        END
 7484                                    """
 7485                                )
 7486
 7487                        # Merge PZfields
 7488                        sql_set_info_option = ""
 7489                        sql_set_sep = ""
 7490                        for sql_set in sql_set_info:
 7491                            if sql_set_sep:
 7492                                sql_set_info_option += f"""
 7493                                    , concat('{sql_set_sep}', {sql_set})
 7494                                """
 7495                            else:
 7496                                sql_set_info_option += f"""
 7497                                    , {sql_set}
 7498                                """
 7499                            sql_set_sep = ";"
 7500
 7501                        sql_queries = []
 7502                        for annotation in prioritizations_config[profile]:
 7503
 7504                            # skip special sections
 7505                            if annotation.startswith("_"):
 7506                                continue
 7507
 7508                            # For each criterions
 7509                            for criterion in prioritizations_config[profile][
 7510                                annotation
 7511                            ]:
 7512
 7513                                # Criterion mode
 7514                                criterion_mode = None
 7515                                if np.any(
 7516                                    np.isin(list(criterion.keys()), ["type", "value"])
 7517                                ):
 7518                                    criterion_mode = "operation"
 7519                                elif np.any(
 7520                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7521                                ):
 7522                                    criterion_mode = "sql"
 7523                                log.debug(f"Criterion Mode: {criterion_mode}")
 7524
 7525                                # Criterion parameters
 7526                                criterion_type = criterion.get("type", None)
 7527                                criterion_value = criterion.get("value", None)
 7528                                criterion_sql = criterion.get("sql", None)
 7529                                criterion_fields = criterion.get("fields", None)
 7530                                criterion_score = criterion.get("score", 0)
 7531                                criterion_flag = criterion.get("flag", "PASS")
 7532                                criterion_class = criterion.get("class", None)
 7533                                criterion_flag_bool = criterion_flag == "PASS"
 7534                                criterion_comment = (
 7535                                    ", ".join(criterion.get("comment", []))
 7536                                    .replace("'", "''")
 7537                                    .replace(";", ",")
 7538                                    .replace("\t", " ")
 7539                                )
 7540                                criterion_infos = (
 7541                                    str(criterion)
 7542                                    .replace("'", "''")
 7543                                    .replace(";", ",")
 7544                                    .replace("\t", " ")
 7545                                )
 7546
 7547                                # SQL
 7548                                if criterion_sql is not None and isinstance(
 7549                                    criterion_sql, list
 7550                                ):
 7551                                    criterion_sql = " ".join(criterion_sql)
 7552
 7553                                # Fields and explode
 7554                                if criterion_fields is None:
 7555                                    criterion_fields = [annotation]
 7556                                if not isinstance(criterion_fields, list):
 7557                                    criterion_fields = str(criterion_fields).split(",")
 7558
 7559                                # Class
 7560                                if criterion_class is not None and not isinstance(
 7561                                    criterion_class, list
 7562                                ):
 7563                                    criterion_class = str(criterion_class).split(",")
 7564
 7565                                for annotation_field in criterion_fields:
 7566
 7567                                    # Explode specific annotation
 7568                                    log.debug(
 7569                                        f"Explode annotation '{annotation_field}'"
 7570                                    )
 7571                                    added_columns += self.explode_infos(
 7572                                        prefix=explode_infos_prefix,
 7573                                        fields=[annotation_field],
 7574                                        table=table_variants,
 7575                                    )
 7576                                    extra_infos = self.get_extra_infos(
 7577                                        table=table_variants
 7578                                    )
 7579
 7580                                    # Check if annotation field is present
 7581                                    if (
 7582                                        f"{explode_infos_prefix}{annotation_field}"
 7583                                        not in extra_infos
 7584                                    ):
 7585                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7586                                        log.error(msq_err)
 7587                                        raise ValueError(msq_err)
 7588                                    else:
 7589                                        log.debug(
 7590                                            f"Annotation '{annotation_field}' in data"
 7591                                        )
 7592
 7593                                sql_set = []
 7594                                sql_set_info = []
 7595
 7596                                # PZ fields set
 7597
 7598                                # PZScore
 7599                                if (
 7600                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7601                                    in list_of_pzfields
 7602                                ):
 7603                                    # VaRank prioritization score mode
 7604                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
 7605                                        sql_set.append(
 7606                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7607                                        )
 7608                                    # default HOWARD prioritization score mode
 7609                                    else:
 7610                                        sql_set.append(
 7611                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7612                                        )
 7613
 7614                                # PZFlag
 7615                                if (
 7616                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7617                                    in list_of_pzfields
 7618                                ):
 7619                                    sql_set.append(
 7620                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7621                                    )
 7622
 7623                                # PZClass
 7624                                if (
 7625                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7626                                    in list_of_pzfields
 7627                                    and criterion_class is not None
 7628                                ):
 7629                                    sql_set.append(
 7630                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7631                                    )
 7632
 7633                                # PZComment
 7634                                if (
 7635                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7636                                    in list_of_pzfields
 7637                                ):
 7638                                    sql_set.append(
 7639                                        f"""
 7640                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7641                                                concat(
 7642                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7643                                                    CASE 
 7644                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7645                                                        THEN ', '
 7646                                                        ELSE ''
 7647                                                    END,
 7648                                                    '{criterion_comment}'
 7649                                                )
 7650                                        """
 7651                                    )
 7652
 7653                                # PZInfos
 7654                                if (
 7655                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7656                                    in list_of_pzfields
 7657                                ):
 7658                                    sql_set.append(
 7659                                        f"""
 7660                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7661                                                concat(
 7662                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7663                                                    '{criterion_infos}'
 7664                                                )
 7665                                        """
 7666                                    )
 7667                                sql_set_option = ",".join(sql_set)
 7668
 7669                                # Criterion and comparison
 7670                                if sql_set_option:
 7671
 7672                                    if criterion_mode in ["operation"]:
 7673
 7674                                        try:
 7675                                            float(criterion_value)
 7676                                            sql_update = f"""
 7677                                                UPDATE {table_variants}
 7678                                                SET {sql_set_option}
 7679                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7680                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7681                                            """
 7682                                        except:
 7683                                            contains_option = ""
 7684                                            if criterion_type == "contains":
 7685                                                contains_option = ".*"
 7686                                            sql_update = f"""
 7687                                                UPDATE {table_variants}
 7688                                                SET {sql_set_option}
 7689                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7690                                            """
 7691                                        sql_queries.append(sql_update)
 7692
 7693                                    elif criterion_mode in ["sql"]:
 7694
 7695                                        sql_update = f"""
 7696                                            UPDATE {table_variants}
 7697                                            SET {sql_set_option}
 7698                                            WHERE {criterion_sql}
 7699                                        """
 7700                                        sql_queries.append(sql_update)
 7701
 7702                                    else:
 7703                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7704                                        log.error(msg_err)
 7705                                        raise ValueError(msg_err)
 7706
 7707                                else:
 7708                                    log.warning(
 7709                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7710                                    )
 7711
 7712                        # PZTags
 7713                        if (
 7714                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7715                            in list_of_pzfields
 7716                        ):
 7717
                            # Create PZTags value
 7719                            pztags_value = ""
 7720                            pztags_sep_default = ","
 7721                            pztags_sep = ""
 7722                            for pzfield in pzfields:
 7723                                if pzfield not in [f"{pz_prefix}Tags"]:
 7724                                    if (
 7725                                        f"{pzfield}{pzfields_sep}{profile}"
 7726                                        in list_of_pzfields
 7727                                    ):
 7728                                        if pzfield in [f"{pz_prefix}Flag"]:
 7729                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7730                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7731                                                    THEN 'PASS'
 7732                                                    ELSE 'FILTERED'
 7733                                                END, '"""
 7734                                        elif pzfield in [f"{pz_prefix}Class"]:
 7735                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7736                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7737                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7738                                                    ELSE '.'
 7739                                                END, '"""
 7740                                        else:
 7741                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7742                                        pztags_sep = pztags_sep_default
 7743
                            # Add Query update for PZTags
 7745                            sql_update_pztags = f"""
 7746                                UPDATE {table_variants}
 7747                                SET INFO = concat(
 7748                                        INFO,
 7749                                        CASE WHEN INFO NOT in ('','.')
 7750                                                THEN ';'
 7751                                                ELSE ''
 7752                                        END,
 7753                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7754                                    )
 7755                                """
 7756                            sql_queries.append(sql_update_pztags)
 7757
                            # Add Query update for PZTags for the default profile
 7759                            if profile == default_profile:
 7760                                sql_update_pztags_default = f"""
 7761                                UPDATE {table_variants}
 7762                                SET INFO = concat(
 7763                                        INFO,
 7764                                        ';',
 7765                                        '{pz_prefix}Tags={pztags_value}'
 7766                                    )
 7767                                """
 7768                                sql_queries.append(sql_update_pztags_default)
 7769
 7770                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7771
 7772                        if sql_queries:
 7773
 7774                            for sql_query in sql_queries:
 7775                                log.debug(
 7776                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7777                                )
 7778                                self.conn.execute(sql_query)
 7779
 7780                        log.info(f"""Profile '{profile}' - Update... """)
 7781                        sql_query_update = f"""
 7782                            UPDATE {table_variants}
 7783                            SET INFO =  
 7784                                concat(
 7785                                    CASE
 7786                                        WHEN INFO NOT IN ('','.')
 7787                                        THEN concat(INFO, ';')
 7788                                        ELSE ''
 7789                                    END
 7790                                    {sql_set_info_option}
 7791                                )
 7792                        """
 7793                        self.conn.execute(sql_query_update)
 7794
 7795        else:
 7796
 7797            log.warning(f"No profiles in parameters")
 7798
 7799        # Remove added columns
 7800        for added_column in added_columns:
 7801            self.drop_column(column=added_column)
 7802
 7803        # Explode INFOS fields into table fields
 7804        if self.get_explode_infos():
 7805            self.explode_infos(
 7806                prefix=self.get_explode_infos_prefix(),
 7807                fields=self.get_explode_infos_fields(),
 7808                force=True,
 7809            )
 7810
 7811        return True
 7812
 7813    ###
 7814    # HGVS
 7815    ###
 7816
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Steps (all visible below):
        1. Read HGVS options from param (including the quick ``hgvs_options`` overrides).
        2. Locate the genome FASTA (pyfaidx), refSeq and refSeqLink database files.
        3. Load the refSeq/refSeqLink rows matching the variants into Polars dataframes,
           and the transcript models into a python structure via ``read_transcripts``.
        4. Annotate SNV/InDel variants in parallel with Dask, writing the comma-joined
           HGVS names into a temporary column, then fold that column into INFO as 'hgvs=...'.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closes over outer-scope state: `polars_conn`, `transcripts`, `genome` and the
            HGVS option flags, all bound before the Dask computation starts.

            :param row: A dictionary-like object that contains the values for the following keys:
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): CHROM/POS are interpolated directly into the SQL text;
            # assumes they contain no quote characters (they come from the variants
            # table) — confirm upstream sanitization
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df is only defined when a refSeqLink file
                # was found; this query presumably requires it when protein output is
                # requested — TODO confirm refseqlink_file is mandatory for use_protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set (and the main name is not already the
                # protein/full form), append a second, protein-level HGVS name
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # (re-created again further below; both bindings are equivalent, and the
        # nested partition functions resolve `polars_conn` at call time)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse comma-separated "opt" or "opt=val" pairs; bare options and
        # TRUE/FALSE strings are coerced to booleans before being merged into
        # param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink
        # param-level settings take precedence over config-level defaults
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (the letters-only regex on REF/ALT excludes symbolic/breakend alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # Random suffix avoids clashing with an existing column
        # NOTE(review): randrange(1000) can still collide — confirm acceptable
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # Only transcripts overlapping a variant position are kept
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to a TSV first because read_transcripts consumes a file handle
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): duplicate of the binding created above; the partition
        # functions use this (latest) binding — both are configured identically
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        # Appends 'hgvs=<value>' after existing INFO content (with ';' separator)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8199
 8200    ###
 8201    # Calculation
 8202    ###
 8203
 8204    def get_operations_help(
 8205        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8206    ) -> list:
 8207
 8208        # Init
 8209        operations_help = []
 8210
 8211        # operations
 8212        operations = self.get_config_json(
 8213            name="calculations",
 8214            config_dict=operations_config_dict,
 8215            config_file=operations_config_file,
 8216        )
 8217        for op in operations:
 8218            op_name = operations[op].get("name", op).upper()
 8219            op_description = operations[op].get("description", op_name)
 8220            op_available = operations[op].get("available", False)
 8221            if op_available:
 8222                operations_help.append(f"   {op_name}: {op_description}")
 8223
 8224        # Sort operations
 8225        operations_help.sort()
 8226
 8227        # insert header
 8228        operations_help.insert(0, "Available calculation operations:")
 8229
 8230        # Return
 8231        return operations_help
 8232
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                "middle" : null
            }

        :param operations: mapping of operation name to its options; overridden by
        param["calculation"]["calculations"] when present, and merged with the quick
        comma-separated param["calculations"] list (quick entries come first)
        :param operations_config_dict: operations configuration provided as a dict
        :param operations_config_file: path to an operations configuration file; when
        None, falls back to param["calculation"]["calculation_config"]
        :raises ValueError: when an operation name, or its "type", is not available in
        the operations configuration
        """

        # Param
        param = self.get_param()

        # CHeck operations config file
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys
        # (operation lookup below is case-insensitive via upper-cased names)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        # param takes precedence; the `operations` argument is only a fallback
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            # Quick operations are inserted first, reusing options from `operations`
            # when an entry with the same (upper-cased) name already exists
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations
        # Dispatch by operation "type": "python" -> calculation_process_function,
        # "sql" (default) -> calculation_process_sql; anything else is an error
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 8360
 8361    def calculation_process_sql(
 8362        self, operation: dict, operation_name: str = "unknown"
 8363    ) -> None:
 8364        """
 8365        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8366        performs the operation, updating the specified table with the result.
 8367
 8368        :param operation: The `operation` parameter is a dictionary that contains information about the
 8369        mathematical operation to be performed. It includes the following keys:
 8370        :type operation: dict
 8371        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8372        the mathematical operation being performed. It is used for logging and error handling purposes,
 8373        defaults to unknown
 8374        :type operation_name: str (optional)
 8375        """
 8376
 8377        # Operation infos
 8378        operation_name = operation.get("name", "unknown")
 8379        log.debug(f"process SQL {operation_name}")
 8380        output_column_name = operation.get("output_column_name", operation_name)
 8381        output_column_type = operation.get("output_column_type", "String")
 8382        prefix = operation.get("explode_infos_prefix", "")
 8383        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8384        output_column_description = operation.get(
 8385            "output_column_description", f"{operation_name} operation"
 8386        )
 8387        operation_query = operation.get("operation_query", None)
 8388        if isinstance(operation_query, list):
 8389            operation_query = " ".join(operation_query)
 8390        operation_info_fields = operation.get("info_fields", [])
 8391        operation_info_fields_check = operation.get("info_fields_check", False)
 8392        operation_info = operation.get("operation_info", True)
 8393        operation_table = operation.get(
 8394            "table", self.get_table_variants(clause="alter")
 8395        )
 8396
 8397        # table variants
 8398        if operation_table:
 8399            table_variants = operation_table
 8400        else:
 8401            table_variants = self.get_table_variants(clause="alter")
 8402
 8403        if operation_query:
 8404
 8405            # Info fields check
 8406            operation_info_fields_check_result = True
 8407            if operation_info_fields_check:
 8408                header_infos = self.get_header().infos
 8409                for info_field in operation_info_fields:
 8410                    operation_info_fields_check_result = (
 8411                        operation_info_fields_check_result
 8412                        and info_field in header_infos
 8413                    )
 8414
 8415            # If info fields available
 8416            if operation_info_fields_check_result:
 8417
 8418                # Added_columns
 8419                added_columns = []
 8420
 8421                # Create VCF header field
 8422                vcf_reader = self.get_header()
 8423                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8424                    output_column_name,
 8425                    ".",
 8426                    output_column_type,
 8427                    output_column_description,
 8428                    "howard calculation",
 8429                    "0",
 8430                    self.code_type_map.get(output_column_type),
 8431                )
 8432
 8433                # Explode infos if needed
 8434                log.debug(f"calculation_process_sql prefix {prefix}")
 8435                added_columns += self.explode_infos(
 8436                    prefix=prefix,
 8437                    fields=[output_column_name] + operation_info_fields,
 8438                    force=False,
 8439                    table=table_variants,
 8440                )
 8441
 8442                # Create column
 8443                added_column = self.add_column(
 8444                    table_name=table_variants,
 8445                    column_name=prefix + output_column_name,
 8446                    column_type=output_column_type_sql,
 8447                    default_value="null",
 8448                )
 8449                added_columns.append(added_column)
 8450
 8451                # Operation calculation
 8452                try:
 8453
 8454                    # Query to update calculation column
 8455                    sql_update = f"""
 8456                        UPDATE {table_variants}
 8457                        SET "{prefix}{output_column_name}" = ({operation_query})
 8458                    """
 8459                    self.conn.execute(sql_update)
 8460
 8461                    # Add to INFO
 8462                    if operation_info:
 8463                        sql_update_info = f"""
 8464                            UPDATE {table_variants}
 8465                            SET "INFO" =
 8466                                concat(
 8467                                    CASE
 8468                                        WHEN "INFO" IS NOT NULL
 8469                                        THEN concat("INFO", ';')
 8470                                        ELSE ''
 8471                                    END,
 8472                                    '{output_column_name}=',
 8473                                    "{prefix}{output_column_name}"
 8474                                )
 8475                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8476                        """
 8477                        self.conn.execute(sql_update_info)
 8478
 8479                except:
 8480                    log.error(
 8481                        f"Operations config: Calculation '{operation_name}' query failed"
 8482                    )
 8483                    raise ValueError(
 8484                        f"Operations config: Calculation '{operation_name}' query failed"
 8485                    )
 8486
 8487                # Remove added columns
 8488                for added_column in added_columns:
 8489                    log.debug(f"added_column: {added_column}")
 8490                    self.drop_column(column=added_column)
 8491
 8492            else:
 8493                log.error(
 8494                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8495                )
 8496                raise ValueError(
 8497                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8498                )
 8499
 8500        else:
 8501            log.error(
 8502                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8503            )
 8504            raise ValueError(
 8505                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8506            )
 8507
 8508    def calculation_process_function(
 8509        self, operation: dict, operation_name: str = "unknown"
 8510    ) -> None:
 8511        """
 8512        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8513        function with the given parameters.
 8514
 8515        :param operation: The `operation` parameter is a dictionary that contains information about the
 8516        operation to be performed. It has the following keys:
 8517        :type operation: dict
 8518        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8519        the operation being performed. It is used for logging purposes, defaults to unknown
 8520        :type operation_name: str (optional)
 8521        """
 8522
 8523        operation_name = operation["name"]
 8524        log.debug(f"process Python {operation_name}")
 8525        function_name = operation["function_name"]
 8526        function_params = operation["function_params"]
 8527        getattr(self, function_name)(*function_params)
 8528
 8529    def calculation_variant_id(self) -> None:
 8530        """
 8531        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8532        updates the INFO field of a variants table with the variant ID.
 8533        """
 8534
 8535        # variant_id annotation field
 8536        variant_id_tag = self.get_variant_id_column()
 8537        added_columns = [variant_id_tag]
 8538
 8539        # variant_id hgvs tags"
 8540        vcf_infos_tags = {
 8541            variant_id_tag: "howard variant ID annotation",
 8542        }
 8543
 8544        # Variants table
 8545        table_variants = self.get_table_variants()
 8546
 8547        # Header
 8548        vcf_reader = self.get_header()
 8549
 8550        # Add variant_id to header
 8551        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8552            variant_id_tag,
 8553            ".",
 8554            "String",
 8555            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8556            "howard calculation",
 8557            "0",
 8558            self.code_type_map.get("String"),
 8559        )
 8560
 8561        # Update
 8562        sql_update = f"""
 8563            UPDATE {table_variants}
 8564            SET "INFO" = 
 8565                concat(
 8566                    CASE
 8567                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8568                        THEN ''
 8569                        ELSE concat("INFO", ';')
 8570                    END,
 8571                    '{variant_id_tag}=',
 8572                    "{variant_id_tag}"
 8573                )
 8574        """
 8575        self.conn.execute(sql_update)
 8576
 8577        # Remove added columns
 8578        for added_column in added_columns:
 8579            self.drop_column(column=added_column)
 8580
 8581    def calculation_extract_snpeff_hgvs(
 8582        self,
 8583        snpeff_hgvs: str = "snpeff_hgvs",
 8584        snpeff_field: str = "ANN",
 8585    ) -> None:
 8586        """
 8587        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8588        annotation field in a VCF file and adds them as a new column in the variants table.
 8589
 8590        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8591        function is used to specify the name of the column that will store the HGVS nomenclatures
 8592        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8593        snpeff_hgvs
 8594        :type snpeff_hgvs: str (optional)
 8595        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8596        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8597        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8598        to ANN
 8599        :type snpeff_field: str (optional)
 8600        """
 8601
 8602        # Snpeff hgvs tags
 8603        vcf_infos_tags = {
 8604            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8605        }
 8606
 8607        # Prefix
 8608        prefix = self.get_explode_infos_prefix()
 8609        if prefix:
 8610            prefix = "INFO/"
 8611
 8612        # snpEff fields
 8613        speff_ann_infos = prefix + snpeff_field
 8614        speff_hgvs_infos = prefix + snpeff_hgvs
 8615
 8616        # Variants table
 8617        table_variants = self.get_table_variants()
 8618
 8619        # Header
 8620        vcf_reader = self.get_header()
 8621
 8622        # Add columns
 8623        added_columns = []
 8624
 8625        # Explode HGVS field in column
 8626        added_columns += self.explode_infos(fields=[snpeff_field])
 8627
 8628        if snpeff_field in vcf_reader.infos:
 8629
 8630            log.debug(vcf_reader.infos[snpeff_field])
 8631
 8632            # Extract ANN header
 8633            ann_description = vcf_reader.infos[snpeff_field].desc
 8634            pattern = r"'(.+?)'"
 8635            match = re.search(pattern, ann_description)
 8636            if match:
 8637                ann_header_match = match.group(1).split(" | ")
 8638                ann_header_desc = {}
 8639                for i in range(len(ann_header_match)):
 8640                    ann_header_info = "".join(
 8641                        char for char in ann_header_match[i] if char.isalnum()
 8642                    )
 8643                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8644                if not ann_header_desc:
 8645                    raise ValueError("Invalid header description format")
 8646            else:
 8647                raise ValueError("Invalid header description format")
 8648
 8649            # Create variant id
 8650            variant_id_column = self.get_variant_id_column()
 8651            added_columns += [variant_id_column]
 8652
 8653            # Create dataframe
 8654            dataframe_snpeff_hgvs = self.get_query_to_df(
 8655                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8656            )
 8657
 8658            # Create main NOMEN column
 8659            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8660                speff_ann_infos
 8661            ].apply(
 8662                lambda x: extract_snpeff_hgvs(
 8663                    str(x), header=list(ann_header_desc.values())
 8664                )
 8665            )
 8666
 8667            # Add snpeff_hgvs to header
 8668            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8669                snpeff_hgvs,
 8670                ".",
 8671                "String",
 8672                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8673                "howard calculation",
 8674                "0",
 8675                self.code_type_map.get("String"),
 8676            )
 8677
 8678            # Update
 8679            sql_update = f"""
 8680                UPDATE variants
 8681                SET "INFO" = 
 8682                    concat(
 8683                        CASE
 8684                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8685                            THEN ''
 8686                            ELSE concat("INFO", ';')
 8687                        END,
 8688                        CASE 
 8689                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8690                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8691                            THEN concat(
 8692                                    '{snpeff_hgvs}=',
 8693                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8694                                )
 8695                            ELSE ''
 8696                        END
 8697                    )
 8698                FROM dataframe_snpeff_hgvs
 8699                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8700
 8701            """
 8702            self.conn.execute(sql_update)
 8703
 8704            # Delete dataframe
 8705            del dataframe_snpeff_hgvs
 8706            gc.collect()
 8707
 8708        else:
 8709
 8710            log.warning(
 8711                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8712            )
 8713
 8714        # Remove added columns
 8715        for added_column in added_columns:
 8716            self.drop_column(column=added_column)
 8717
 8718    def calculation_snpeff_ann_explode(
 8719        self,
 8720        uniquify: bool = True,
 8721        output_format: str = "fields",
 8722        output_prefix: str = "snpeff_",
 8723        snpeff_field: str = "ANN",
 8724    ) -> None:
 8725        """
 8726        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8727        exploding the HGVS field and updating variant information accordingly.
 8728
 8729        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8730        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8731        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8732        defaults to True
 8733        :type uniquify: bool (optional)
 8734        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8735        function specifies the format in which the output annotations will be generated. It has a
 8736        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8737        format, defaults to fields
 8738        :type output_format: str (optional)
 8739        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8740        method is used to specify the prefix that will be added to the output annotations generated
 8741        during the calculation process. This prefix helps to differentiate the newly added annotations
 8742        from existing ones in the output data. By default, the, defaults to ANN_
 8743        :type output_prefix: str (optional)
 8744        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8745        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8746        field will be processed to explode the HGVS annotations and update the variant information
 8747        accordingly, defaults to ANN
 8748        :type snpeff_field: str (optional)
 8749        """
 8750
 8751        # SnpEff annotation field
 8752        snpeff_hgvs = "snpeff_ann_explode"
 8753
 8754        # Snpeff hgvs tags
 8755        vcf_infos_tags = {
 8756            snpeff_hgvs: "Explode snpEff annotations",
 8757        }
 8758
 8759        # Prefix
 8760        prefix = self.get_explode_infos_prefix()
 8761        if prefix:
 8762            prefix = "INFO/"
 8763
 8764        # snpEff fields
 8765        speff_ann_infos = prefix + snpeff_field
 8766        speff_hgvs_infos = prefix + snpeff_hgvs
 8767
 8768        # Variants table
 8769        table_variants = self.get_table_variants()
 8770
 8771        # Header
 8772        vcf_reader = self.get_header()
 8773
 8774        # Add columns
 8775        added_columns = []
 8776
 8777        # Explode HGVS field in column
 8778        added_columns += self.explode_infos(fields=[snpeff_field])
 8779        log.debug(f"snpeff_field={snpeff_field}")
 8780        log.debug(f"added_columns={added_columns}")
 8781
 8782        if snpeff_field in vcf_reader.infos:
 8783
 8784            # Extract ANN header
 8785            ann_description = vcf_reader.infos[snpeff_field].desc
 8786            pattern = r"'(.+?)'"
 8787            match = re.search(pattern, ann_description)
 8788            if match:
 8789                ann_header_match = match.group(1).split(" | ")
 8790                ann_header = []
 8791                ann_header_desc = {}
 8792                for i in range(len(ann_header_match)):
 8793                    ann_header_info = "".join(
 8794                        char for char in ann_header_match[i] if char.isalnum()
 8795                    )
 8796                    ann_header.append(ann_header_info)
 8797                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8798                if not ann_header_desc:
 8799                    raise ValueError("Invalid header description format")
 8800            else:
 8801                raise ValueError("Invalid header description format")
 8802
 8803            # Create variant id
 8804            variant_id_column = self.get_variant_id_column()
 8805            added_columns += [variant_id_column]
 8806
 8807            # Create dataframe
 8808            dataframe_snpeff_hgvs = self.get_query_to_df(
 8809                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8810            )
 8811
 8812            # Create snpEff columns
 8813            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8814                speff_ann_infos
 8815            ].apply(
 8816                lambda x: explode_snpeff_ann(
 8817                    str(x),
 8818                    uniquify=uniquify,
 8819                    output_format=output_format,
 8820                    prefix=output_prefix,
 8821                    header=list(ann_header_desc.values()),
 8822                )
 8823            )
 8824
 8825            # Header
 8826            ann_annotations_prefix = ""
 8827            if output_format.upper() in ["JSON"]:
 8828                ann_annotations_prefix = f"{output_prefix}="
 8829                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8830                    output_prefix,
 8831                    ".",
 8832                    "String",
 8833                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8834                    + " - JSON format",
 8835                    "howard calculation",
 8836                    "0",
 8837                    self.code_type_map.get("String"),
 8838                )
 8839            else:
 8840                for ann_annotation in ann_header:
 8841                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8842                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8843                        ann_annotation_id,
 8844                        ".",
 8845                        "String",
 8846                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8847                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8848                        "howard calculation",
 8849                        "0",
 8850                        self.code_type_map.get("String"),
 8851                    )
 8852
 8853            # Update
 8854            sql_update = f"""
 8855                UPDATE variants
 8856                SET "INFO" = 
 8857                    concat(
 8858                        CASE
 8859                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8860                            THEN ''
 8861                            ELSE concat("INFO", ';')
 8862                        END,
 8863                        CASE 
 8864                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8865                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8866                            THEN concat(
 8867                                '{ann_annotations_prefix}',
 8868                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8869                                )
 8870                            ELSE ''
 8871                        END
 8872                    )
 8873                FROM dataframe_snpeff_hgvs
 8874                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8875
 8876            """
 8877            self.conn.execute(sql_update)
 8878
 8879            # Delete dataframe
 8880            del dataframe_snpeff_hgvs
 8881            gc.collect()
 8882
 8883        else:
 8884
 8885            log.warning(
 8886                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8887            )
 8888
 8889        # Remove added columns
 8890        for added_column in added_columns:
 8891            self.drop_column(column=added_column)
 8892
 8893    def calculation_extract_nomen(self) -> None:
 8894        """
 8895        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8896        """
 8897
 8898        # NOMEN field
 8899        field_nomen_dict = "NOMEN_DICT"
 8900
 8901        # NOMEN structure
 8902        nomen_dict = {
 8903            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8904            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8905            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8906            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8907            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8908            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8909            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8910            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8911            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8912            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8913        }
 8914
 8915        # Param
 8916        param = self.get_param()
 8917
 8918        # Threads
 8919        threads = self.get_threads()
 8920
 8921        # Prefix
 8922        prefix = self.get_explode_infos_prefix()
 8923
 8924        # Header
 8925        vcf_reader = self.get_header()
 8926
 8927        # Added columns
 8928        added_columns = []
 8929
 8930        # Get HGVS field
 8931        hgvs_field = (
 8932            param.get("calculation", {})
 8933            .get("calculations", {})
 8934            .get("NOMEN", {})
 8935            .get("options", {})
 8936            .get("hgvs_field", "hgvs")
 8937        )
 8938
 8939        # Get NOMEN pattern
 8940        nomen_pattern = (
 8941            param.get("calculation", {})
 8942            .get("calculations", {})
 8943            .get("NOMEN", {})
 8944            .get("options", {})
 8945            .get("pattern", None)
 8946        )
 8947
 8948        # transcripts list of preference sources
 8949        transcripts_sources = {}
 8950
 8951        # Get transcripts
 8952        transcripts_file = (
 8953            param.get("calculation", {})
 8954            .get("calculations", {})
 8955            .get("NOMEN", {})
 8956            .get("options", {})
 8957            .get("transcripts", None)
 8958        )
 8959        transcripts_file = full_path(transcripts_file)
 8960        if transcripts_file:
 8961            if os.path.exists(transcripts_file):
 8962                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8963                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8964                transcripts_sources["file"] = transcripts_from_file
 8965            else:
 8966                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8967                log.error(msg_err)
 8968                raise ValueError(msg_err)
 8969
 8970        # Get transcripts table
 8971        transcripts_table = (
 8972            param.get("calculation", {})
 8973            .get("calculations", {})
 8974            .get("NOMEN", {})
 8975            .get("options", {})
 8976            .get("transcripts_table", self.get_table_variants())
 8977        )
 8978        # Get transcripts column
 8979        transcripts_column = (
 8980            param.get("calculation", {})
 8981            .get("calculations", {})
 8982            .get("NOMEN", {})
 8983            .get("options", {})
 8984            .get("transcripts_column", None)
 8985        )
 8986
 8987        if transcripts_table and transcripts_column:
 8988            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8989            # Explode if not exists
 8990            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
 8991        else:
 8992            extra_field_transcript = f"NULL"
 8993
 8994        # Transcripts of preference source order
 8995        transcripts_order = (
 8996            param.get("calculation", {})
 8997            .get("calculations", {})
 8998            .get("NOMEN", {})
 8999            .get("options", {})
 9000            .get("transcripts_order", ["column", "file"])
 9001        )
 9002
 9003        # Transcripts from file
 9004        transcripts = transcripts_sources.get("file", [])
 9005
 9006        # Explode HGVS field in column
 9007        added_columns += self.explode_infos(fields=[hgvs_field])
 9008
 9009        # extra infos
 9010        extra_infos = self.get_extra_infos()
 9011        extra_field = prefix + hgvs_field
 9012
 9013        if extra_field in extra_infos:
 9014
 9015            # Create dataframe
 9016            dataframe_hgvs = self.get_query_to_df(
 9017                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 9018            )
 9019
 9020            # Transcripts rank
 9021            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
 9022            transcripts_len = len(transcripts_rank)
 9023
 9024            # Create main NOMEN column
 9025            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 9026                lambda x: find_nomen(
 9027                    hgvs=x.hgvs,
 9028                    transcript=x.transcript,
 9029                    transcripts=transcripts_rank,
 9030                    pattern=nomen_pattern,
 9031                    transcripts_source_order=transcripts_order,
 9032                    transcripts_len=transcripts_len
 9033                ),
 9034                axis=1,
 9035            )
 9036
 9037            # Explode NOMEN Structure and create SQL set for update
 9038            sql_nomen_fields = []
 9039            for nomen_field in nomen_dict:
 9040
 9041                # Create VCF header field
 9042                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9043                    nomen_field,
 9044                    ".",
 9045                    "String",
 9046                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9047                    "howard calculation",
 9048                    "0",
 9049                    self.code_type_map.get("String"),
 9050                )
 9051
 9052                # Add field to SQL query update
 9053                sql_nomen_fields.append(
 9054                    f"""
 9055                        CASE 
 9056                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
 9057                            THEN concat(
 9058                                    ';{nomen_field}=',
 9059                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
 9060                                )
 9061                            ELSE ''
 9062                        END
 9063                    """
 9064                )
 9065
 9066            # SQL set for update
 9067            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9068
 9069            # Update
 9070            sql_update = f"""
 9071                UPDATE variants
 9072                SET "INFO" = 
 9073                    concat(
 9074                        CASE
 9075                            WHEN "INFO" IS NULL
 9076                            THEN ''
 9077                            ELSE "INFO"
 9078                        END,
 9079                        {sql_nomen_fields_set}
 9080                    )
 9081                FROM dataframe_hgvs
 9082                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9083                    AND variants."POS" = dataframe_hgvs."POS" 
 9084                    AND variants."REF" = dataframe_hgvs."REF"
 9085                    AND variants."ALT" = dataframe_hgvs."ALT"
 9086            """
 9087            self.conn.execute(sql_update)
 9088
 9089            # Delete dataframe
 9090            del dataframe_hgvs
 9091            gc.collect()
 9092
 9093        # Remove added columns
 9094        for added_column in added_columns:
 9095            self.drop_column(column=added_column)
 9096
 9097    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9098        """
 9099        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9100        pipeline/sample for a variant and updates the variant information in a VCF file.
 9101
 9102        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9103        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9104        VCF header and to update the corresponding field in the variants table, defaults to
 9105        findbypipeline
 9106        :type tag: str (optional)
 9107        """
 9108
 9109        # if FORMAT and samples
 9110        if (
 9111            "FORMAT" in self.get_header_columns_as_list()
 9112            and self.get_header_sample_list()
 9113        ):
 9114
 9115            # findbypipeline annotation field
 9116            findbypipeline_tag = tag
 9117
 9118            # VCF infos tags
 9119            vcf_infos_tags = {
 9120                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9121            }
 9122
 9123            # Prefix
 9124            prefix = self.get_explode_infos_prefix()
 9125
 9126            # Field
 9127            findbypipeline_infos = prefix + findbypipeline_tag
 9128
 9129            # Variants table
 9130            table_variants = self.get_table_variants()
 9131
 9132            # Header
 9133            vcf_reader = self.get_header()
 9134
 9135            # Create variant id
 9136            variant_id_column = self.get_variant_id_column()
 9137            added_columns = [variant_id_column]
 9138
 9139            # variant_id, FORMAT and samples
 9140            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9141                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9142            )
 9143
 9144            # Create dataframe
 9145            dataframe_findbypipeline = self.get_query_to_df(
 9146                f""" SELECT {samples_fields} FROM {table_variants} """
 9147            )
 9148
 9149            # Create findbypipeline column
 9150            dataframe_findbypipeline[findbypipeline_infos] = (
 9151                dataframe_findbypipeline.apply(
 9152                    lambda row: findbypipeline(
 9153                        row, samples=self.get_header_sample_list()
 9154                    ),
 9155                    axis=1,
 9156                )
 9157            )
 9158
 9159            # Add snpeff_hgvs to header
 9160            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9161                findbypipeline_tag,
 9162                ".",
 9163                "String",
 9164                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9165                "howard calculation",
 9166                "0",
 9167                self.code_type_map.get("String"),
 9168            )
 9169
 9170            # Update
 9171            sql_update = f"""
 9172                UPDATE variants
 9173                SET "INFO" = 
 9174                    concat(
 9175                        CASE
 9176                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9177                            THEN ''
 9178                            ELSE concat("INFO", ';')
 9179                        END,
 9180                        CASE 
 9181                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9182                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9183                            THEN concat(
 9184                                    '{findbypipeline_tag}=',
 9185                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9186                                )
 9187                            ELSE ''
 9188                        END
 9189                    )
 9190                FROM dataframe_findbypipeline
 9191                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9192            """
 9193            self.conn.execute(sql_update)
 9194
 9195            # Remove added columns
 9196            for added_column in added_columns:
 9197                self.drop_column(column=added_column)
 9198
 9199            # Delete dataframe
 9200            del dataframe_findbypipeline
 9201            gc.collect()
 9202
 9203    def calculation_genotype_concordance(self) -> None:
 9204        """
 9205        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9206        multi-caller VCF files and updates the variant information in the database.
 9207        """
 9208
 9209        # if FORMAT and samples
 9210        if (
 9211            "FORMAT" in self.get_header_columns_as_list()
 9212            and self.get_header_sample_list()
 9213        ):
 9214
 9215            # genotypeconcordance annotation field
 9216            genotypeconcordance_tag = "genotypeconcordance"
 9217
 9218            # VCF infos tags
 9219            vcf_infos_tags = {
 9220                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9221            }
 9222
 9223            # Prefix
 9224            prefix = self.get_explode_infos_prefix()
 9225
 9226            # Field
 9227            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9228
 9229            # Variants table
 9230            table_variants = self.get_table_variants()
 9231
 9232            # Header
 9233            vcf_reader = self.get_header()
 9234
 9235            # Create variant id
 9236            variant_id_column = self.get_variant_id_column()
 9237            added_columns = [variant_id_column]
 9238
 9239            # variant_id, FORMAT and samples
 9240            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9241                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9242            )
 9243
 9244            # Create dataframe
 9245            dataframe_genotypeconcordance = self.get_query_to_df(
 9246                f""" SELECT {samples_fields} FROM {table_variants} """
 9247            )
 9248
 9249            # Create genotypeconcordance column
 9250            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9251                dataframe_genotypeconcordance.apply(
 9252                    lambda row: genotypeconcordance(
 9253                        row, samples=self.get_header_sample_list()
 9254                    ),
 9255                    axis=1,
 9256                )
 9257            )
 9258
 9259            # Add genotypeconcordance to header
 9260            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9261                genotypeconcordance_tag,
 9262                ".",
 9263                "String",
 9264                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9265                "howard calculation",
 9266                "0",
 9267                self.code_type_map.get("String"),
 9268            )
 9269
 9270            # Update
 9271            sql_update = f"""
 9272                UPDATE variants
 9273                SET "INFO" = 
 9274                    concat(
 9275                        CASE
 9276                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9277                            THEN ''
 9278                            ELSE concat("INFO", ';')
 9279                        END,
 9280                        CASE
 9281                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9282                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9283                            THEN concat(
 9284                                    '{genotypeconcordance_tag}=',
 9285                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9286                                )
 9287                            ELSE ''
 9288                        END
 9289                    )
 9290                FROM dataframe_genotypeconcordance
 9291                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9292            """
 9293            self.conn.execute(sql_update)
 9294
 9295            # Remove added columns
 9296            for added_column in added_columns:
 9297                self.drop_column(column=added_column)
 9298
 9299            # Delete dataframe
 9300            del dataframe_genotypeconcordance
 9301            gc.collect()
 9302
 9303    def calculation_barcode(self, tag: str = "barcode") -> None:
 9304        """
 9305        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9306        updates the INFO field in the file with the calculated barcode values.
 9307
 9308        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9309        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9310        the default tag name is set to "barcode", defaults to barcode
 9311        :type tag: str (optional)
 9312        """
 9313
 9314        # if FORMAT and samples
 9315        if (
 9316            "FORMAT" in self.get_header_columns_as_list()
 9317            and self.get_header_sample_list()
 9318        ):
 9319
 9320            # barcode annotation field
 9321            if not tag:
 9322                tag = "barcode"
 9323
 9324            # VCF infos tags
 9325            vcf_infos_tags = {
 9326                tag: "barcode calculation (VaRank)",
 9327            }
 9328
 9329            # Prefix
 9330            prefix = self.get_explode_infos_prefix()
 9331
 9332            # Field
 9333            barcode_infos = prefix + tag
 9334
 9335            # Variants table
 9336            table_variants = self.get_table_variants()
 9337
 9338            # Header
 9339            vcf_reader = self.get_header()
 9340
 9341            # Create variant id
 9342            variant_id_column = self.get_variant_id_column()
 9343            added_columns = [variant_id_column]
 9344
 9345            # variant_id, FORMAT and samples
 9346            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9347                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9348            )
 9349
 9350            # Create dataframe
 9351            dataframe_barcode = self.get_query_to_df(
 9352                f""" SELECT {samples_fields} FROM {table_variants} """
 9353            )
 9354
 9355            # Create barcode column
 9356            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9357                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9358            )
 9359
 9360            # Add barcode to header
 9361            vcf_reader.infos[tag] = vcf.parser._Info(
 9362                tag,
 9363                ".",
 9364                "String",
 9365                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9366                "howard calculation",
 9367                "0",
 9368                self.code_type_map.get("String"),
 9369            )
 9370
 9371            # Update
 9372            sql_update = f"""
 9373                UPDATE {table_variants}
 9374                SET "INFO" = 
 9375                    concat(
 9376                        CASE
 9377                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9378                            THEN ''
 9379                            ELSE concat("INFO", ';')
 9380                        END,
 9381                        CASE
 9382                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9383                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9384                            THEN concat(
 9385                                    '{tag}=',
 9386                                    dataframe_barcode."{barcode_infos}"
 9387                                )
 9388                            ELSE ''
 9389                        END
 9390                    )
 9391                FROM dataframe_barcode
 9392                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9393            """
 9394            self.conn.execute(sql_update)
 9395
 9396            # Remove added columns
 9397            for added_column in added_columns:
 9398                self.drop_column(column=added_column)
 9399
 9400            # Delete dataframe
 9401            del dataframe_barcode
 9402            gc.collect()
 9403
 9404    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9405        """
 9406        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9407        and updates the INFO field in the file with the calculated barcode values.
 9408
 9409        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9410        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9411        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9412        :type tag: str (optional)
 9413        """
 9414
 9415        # if FORMAT and samples
 9416        if (
 9417            "FORMAT" in self.get_header_columns_as_list()
 9418            and self.get_header_sample_list()
 9419        ):
 9420
 9421            # barcode annotation field
 9422            if not tag:
 9423                tag = "BCF"
 9424
 9425            # VCF infos tags
 9426            vcf_infos_tags = {
 9427                tag: "barcode family calculation",
 9428                f"{tag}S": "barcode family samples",
 9429            }
 9430
 9431            # Param
 9432            param = self.get_param()
 9433            log.debug(f"param={param}")
 9434
 9435            # Prefix
 9436            prefix = self.get_explode_infos_prefix()
 9437
 9438            # PED param
 9439            ped = (
 9440                param.get("calculation", {})
 9441                .get("calculations", {})
 9442                .get("BARCODEFAMILY", {})
 9443                .get("family_pedigree", None)
 9444            )
 9445            log.debug(f"ped={ped}")
 9446
 9447            # Load PED
 9448            if ped:
 9449
 9450                # Pedigree is a file
 9451                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9452                    log.debug("Pedigree is file")
 9453                    with open(full_path(ped)) as ped:
 9454                        ped = yaml.safe_load(ped)
 9455
 9456                # Pedigree is a string
 9457                elif isinstance(ped, str):
 9458                    log.debug("Pedigree is str")
 9459                    try:
 9460                        ped = json.loads(ped)
 9461                        log.debug("Pedigree is json str")
 9462                    except ValueError as e:
 9463                        ped_samples = ped.split(",")
 9464                        ped = {}
 9465                        for ped_sample in ped_samples:
 9466                            ped[ped_sample] = ped_sample
 9467
 9468                # Pedigree is a dict
 9469                elif isinstance(ped, dict):
 9470                    log.debug("Pedigree is dict")
 9471
 9472                # Pedigree is not well formatted
 9473                else:
 9474                    msg_error = "Pedigree not well formatted"
 9475                    log.error(msg_error)
 9476                    raise ValueError(msg_error)
 9477
 9478                # Construct list
 9479                ped_samples = list(ped.values())
 9480
 9481            else:
 9482                log.debug("Pedigree not defined. Take all samples")
 9483                ped_samples = self.get_header_sample_list()
 9484                ped = {}
 9485                for ped_sample in ped_samples:
 9486                    ped[ped_sample] = ped_sample
 9487
 9488            # Check pedigree
 9489            if not ped or len(ped) == 0:
 9490                msg_error = f"Error in pedigree: samples {ped_samples}"
 9491                log.error(msg_error)
 9492                raise ValueError(msg_error)
 9493
 9494            # Log
 9495            log.info(
 9496                "Calculation 'BARCODEFAMILY' - Samples: "
 9497                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9498            )
 9499            log.debug(f"ped_samples={ped_samples}")
 9500
 9501            # Field
 9502            barcode_infos = prefix + tag
 9503
 9504            # Variants table
 9505            table_variants = self.get_table_variants()
 9506
 9507            # Header
 9508            vcf_reader = self.get_header()
 9509
 9510            # Create variant id
 9511            variant_id_column = self.get_variant_id_column()
 9512            added_columns = [variant_id_column]
 9513
 9514            # variant_id, FORMAT and samples
 9515            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9516                [f""" "{sample}" """ for sample in ped_samples]
 9517            )
 9518
 9519            # Create dataframe
 9520            dataframe_barcode = self.get_query_to_df(
 9521                f""" SELECT {samples_fields} FROM {table_variants} """
 9522            )
 9523
 9524            # Create barcode column
 9525            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9526                lambda row: barcode(row, samples=ped_samples), axis=1
 9527            )
 9528
 9529            # Add barcode family to header
 9530            # Add vaf_normalization to header
 9531            vcf_reader.formats[tag] = vcf.parser._Format(
 9532                id=tag,
 9533                num=".",
 9534                type="String",
 9535                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9536                type_code=self.code_type_map.get("String"),
 9537            )
 9538            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9539                id=f"{tag}S",
 9540                num=".",
 9541                type="String",
 9542                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9543                type_code=self.code_type_map.get("String"),
 9544            )
 9545
 9546            # Update
 9547            # for sample in ped_samples:
 9548            sql_update_set = []
 9549            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9550                if sample in ped_samples:
 9551                    value = f'dataframe_barcode."{barcode_infos}"'
 9552                    value_samples = "'" + ",".join([f""" "{sample}" """ for sample in ped_samples]) + "'"
 9553                    ped_samples
 9554                elif sample == "FORMAT":
 9555                    value = f"'{tag}'"
 9556                    value_samples = f"'{tag}S'"
 9557                else:
 9558                    value = "'.'"
 9559                    value_samples = "'.'"
 9560                format_regex = r"[a-zA-Z0-9\s]"
 9561                sql_update_set.append(
 9562                    f"""
 9563                        "{sample}" = 
 9564                        concat(
 9565                            CASE
 9566                                WHEN {table_variants}."{sample}" = './.'
 9567                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9568                                ELSE {table_variants}."{sample}"
 9569                            END,
 9570                            ':',
 9571                            {value},
 9572                            ':',
 9573                            {value_samples}
 9574                        )
 9575                    """
 9576                )
 9577
 9578            sql_update_set_join = ", ".join(sql_update_set)
 9579            sql_update = f"""
 9580                UPDATE {table_variants}
 9581                SET {sql_update_set_join}
 9582                FROM dataframe_barcode
 9583                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9584            """
 9585            self.conn.execute(sql_update)
 9586
 9587            # Remove added columns
 9588            for added_column in added_columns:
 9589                self.drop_column(column=added_column)
 9590
 9591            # Delete dataframe
 9592            del dataframe_barcode
 9593            gc.collect()
 9594
 9595    def calculation_trio(self) -> None:
 9596        """
 9597        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9598        information to the INFO field of each variant.
 9599        """
 9600
 9601        # if FORMAT and samples
 9602        if (
 9603            "FORMAT" in self.get_header_columns_as_list()
 9604            and self.get_header_sample_list()
 9605        ):
 9606
 9607            # trio annotation field
 9608            trio_tag = "trio"
 9609
 9610            # VCF infos tags
 9611            vcf_infos_tags = {
 9612                "trio": "trio calculation",
 9613            }
 9614
 9615            # Param
 9616            param = self.get_param()
 9617
 9618            # Prefix
 9619            prefix = self.get_explode_infos_prefix()
 9620
 9621            # Trio param
 9622            trio_ped = (
 9623                param.get("calculation", {})
 9624                .get("calculations", {})
 9625                .get("TRIO", {})
 9626                .get("trio_pedigree", None)
 9627            )
 9628
 9629            # Load trio
 9630            if trio_ped:
 9631
 9632                # Trio pedigree is a file
 9633                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9634                    log.debug("TRIO pedigree is file")
 9635                    with open(full_path(trio_ped)) as trio_ped:
 9636                        trio_ped = yaml.safe_load(trio_ped)
 9637
 9638                # Trio pedigree is a string
 9639                elif isinstance(trio_ped, str):
 9640                    log.debug("TRIO pedigree is str")
 9641                    try:
 9642                        trio_ped = json.loads(trio_ped)
 9643                        log.debug("TRIO pedigree is json str")
 9644                    except ValueError as e:
 9645                        trio_samples = trio_ped.split(",")
 9646                        if len(trio_samples) == 3:
 9647                            trio_ped = {
 9648                                "father": trio_samples[0],
 9649                                "mother": trio_samples[1],
 9650                                "child": trio_samples[2],
 9651                            }
 9652                            log.debug("TRIO pedigree is list str")
 9653                        else:
 9654                            msg_error = "TRIO pedigree not well formatted"
 9655                            log.error(msg_error)
 9656                            raise ValueError(msg_error)
 9657
 9658                # Trio pedigree is a dict
 9659                elif isinstance(trio_ped, dict):
 9660                    log.debug("TRIO pedigree is dict")
 9661
 9662                # Trio pedigree is not well formatted
 9663                else:
 9664                    msg_error = "TRIO pedigree not well formatted"
 9665                    log.error(msg_error)
 9666                    raise ValueError(msg_error)
 9667
 9668                # Construct trio list
 9669                trio_samples = [
 9670                    trio_ped.get("father", ""),
 9671                    trio_ped.get("mother", ""),
 9672                    trio_ped.get("child", ""),
 9673                ]
 9674
 9675            else:
 9676                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9677                samples_list = self.get_header_sample_list()
 9678                if len(samples_list) >= 3:
 9679                    trio_samples = self.get_header_sample_list()[0:3]
 9680                    trio_ped = {
 9681                        "father": trio_samples[0],
 9682                        "mother": trio_samples[1],
 9683                        "child": trio_samples[2],
 9684                    }
 9685                else:
 9686                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9687                    log.error(msg_error)
 9688                    raise ValueError(msg_error)
 9689
 9690            # Check trio pedigree
 9691            if not trio_ped or len(trio_ped) != 3:
 9692                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9693                log.error(msg_error)
 9694                raise ValueError(msg_error)
 9695
 9696            # Log
 9697            log.info(
 9698                f"Calculation 'TRIO' - Samples: "
 9699                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9700            )
 9701
 9702            # Field
 9703            trio_infos = prefix + trio_tag
 9704
 9705            # Variants table
 9706            table_variants = self.get_table_variants()
 9707
 9708            # Header
 9709            vcf_reader = self.get_header()
 9710
 9711            # Create variant id
 9712            variant_id_column = self.get_variant_id_column()
 9713            added_columns = [variant_id_column]
 9714
 9715            # variant_id, FORMAT and samples
 9716            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9717                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9718            )
 9719
 9720            # Create dataframe
 9721            dataframe_trio = self.get_query_to_df(
 9722                f""" SELECT {samples_fields} FROM {table_variants} """
 9723            )
 9724
 9725            # Create trio column
 9726            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9727                lambda row: trio(row, samples=trio_samples), axis=1
 9728            )
 9729
 9730            # Add trio to header
 9731            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9732                trio_tag,
 9733                ".",
 9734                "String",
 9735                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9736                "howard calculation",
 9737                "0",
 9738                self.code_type_map.get("String"),
 9739            )
 9740
 9741            # Update
 9742            sql_update = f"""
 9743                UPDATE {table_variants}
 9744                SET "INFO" = 
 9745                    concat(
 9746                        CASE
 9747                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9748                            THEN ''
 9749                            ELSE concat("INFO", ';')
 9750                        END,
 9751                        CASE
 9752                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9753                             AND dataframe_trio."{trio_infos}" NOT NULL
 9754                            THEN concat(
 9755                                    '{trio_tag}=',
 9756                                    dataframe_trio."{trio_infos}"
 9757                                )
 9758                            ELSE ''
 9759                        END
 9760                    )
 9761                FROM dataframe_trio
 9762                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9763            """
 9764            self.conn.execute(sql_update)
 9765
 9766            # Remove added columns
 9767            for added_column in added_columns:
 9768                self.drop_column(column=added_column)
 9769
 9770            # Delete dataframe
 9771            del dataframe_trio
 9772            gc.collect()
 9773
 9774    def calculation_vaf_normalization(self) -> None:
 9775        """
 9776        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9777        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9778        :return: The function does not return anything.
 9779        """
 9780
 9781        # if FORMAT and samples
 9782        if (
 9783            "FORMAT" in self.get_header_columns_as_list()
 9784            and self.get_header_sample_list()
 9785        ):
 9786
 9787            # vaf_normalization annotation field
 9788            vaf_normalization_tag = "VAF"
 9789
 9790            # VCF infos tags
 9791            vcf_infos_tags = {
 9792                "VAF": "VAF Variant Frequency",
 9793            }
 9794
 9795            # Prefix
 9796            prefix = self.get_explode_infos_prefix()
 9797
 9798            # Variants table
 9799            table_variants = self.get_table_variants()
 9800
 9801            # Header
 9802            vcf_reader = self.get_header()
 9803
 9804            # Do not calculate if VAF already exists
 9805            if "VAF" in vcf_reader.formats:
 9806                log.debug("VAF already on genotypes")
 9807                return
 9808
 9809            # Create variant id
 9810            variant_id_column = self.get_variant_id_column()
 9811            added_columns = [variant_id_column]
 9812
 9813            # variant_id, FORMAT and samples
 9814            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9815                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9816            )
 9817
 9818            # Create dataframe
 9819            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9820            log.debug(f"query={query}")
 9821            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9822
 9823            vaf_normalization_set = []
 9824
 9825            # for each sample vaf_normalization
 9826            for sample in self.get_header_sample_list():
 9827                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9828                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9829                )
 9830                vaf_normalization_set.append(
 9831                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9832                )
 9833
 9834            # Add VAF to FORMAT
 9835            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9836                "FORMAT"
 9837            ].apply(lambda x: str(x) + ":VAF")
 9838            vaf_normalization_set.append(
 9839                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9840            )
 9841
 9842            # Add vaf_normalization to header
 9843            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9844                id=vaf_normalization_tag,
 9845                num="1",
 9846                type="Float",
 9847                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9848                type_code=self.code_type_map.get("Float"),
 9849            )
 9850
 9851            # Create fields to add in INFO
 9852            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9853
 9854            # Update
 9855            sql_update = f"""
 9856                UPDATE {table_variants}
 9857                SET {sql_vaf_normalization_set}
 9858                FROM dataframe_vaf_normalization
 9859                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9860
 9861            """
 9862            self.conn.execute(sql_update)
 9863
 9864            # Remove added columns
 9865            for added_column in added_columns:
 9866                self.drop_column(column=added_column)
 9867
 9868            # Delete dataframe
 9869            del dataframe_vaf_normalization
 9870            gc.collect()
 9871
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        The statistics are computed per variant across all samples by the `genotype_stats`
        helper and written as individual '<info>_stats_*' INFO tags (nb, list, min, max,
        mean, mediane, stdev). No-op when the file has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe with one row per variant (id, FORMAT, genotypes)
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column; each cell presumably holds a per-variant
            # dict of statistics keyed like vcf_infos_tags — confirm against
            # the genotype_stats helper
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one CASE expression per statistic
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                # (the lambda is applied immediately within this iteration,
                # so the loop variable `stat` is bound as intended)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic's INFO tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no separator; subsequent ones are
                # prefixed with ';' (order-sensitive: depends on the list
                # length at this point in the loop)
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update INFO with all computed statistics at once
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()
10009
10010    def calculation_transcripts_annotation(
10011        self, info_json: str = None, info_format: str = None
10012    ) -> None:
10013        """
10014        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10015        field to it if transcripts are available.
10016
10017        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10018        is a string parameter that represents the information field to be used in the transcripts JSON.
10019        It is used to specify the JSON format for the transcripts information. If no value is provided
10020        when calling the method, it defaults to "
10021        :type info_json: str
10022        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10023        method is a string parameter that specifies the format of the information field to be used in
10024        the transcripts JSON. It is used to define the format of the information field
10025        :type info_format: str
10026        """
10027
10028        # Create transcripts table
10029        transcripts_table = self.create_transcript_view()
10030
10031        # Add info field
10032        if transcripts_table:
10033            self.transcript_view_to_variants(
10034                transcripts_table=transcripts_table,
10035                transcripts_info_field_json=info_json,
10036                transcripts_info_field_format=info_format,
10037            )
10038        else:
10039            log.info("No Transcripts to process. Check param.json file configuration")
10040
10041    def calculation_transcripts_prioritization(self) -> None:
10042        """
10043        The function `calculation_transcripts_prioritization` creates a transcripts table and
10044        prioritizes transcripts based on certain criteria.
10045        """
10046
10047        # Create transcripts table
10048        transcripts_table = self.create_transcript_view()
10049
10050        # Add info field
10051        if transcripts_table:
10052            self.transcripts_prioritization(transcripts_table=transcripts_table)
10053        else:
10054            log.info("No Transcripts to process. Check param.json file configuration")
10055
10056    def calculation_transcripts_export(self) -> None:
10057        """ """
10058
10059        # Create transcripts table
10060        transcripts_table = self.create_transcript_view()
10061
10062        # Add info field
10063        if transcripts_table:
10064            self.transcripts_export(transcripts_table=transcripts_table)
10065        else:
10066            log.info("No Transcripts to process. Check param.json file configuration")
10067
10068    ###############
10069    # Transcripts #
10070    ###############
10071
10072    def transcripts_export(
10073        self, transcripts_table: str = None, param: dict = {}
10074    ) -> bool:
10075        """ """
10076
10077        log.debug("Start transcripts export...")
10078
10079        # Param
10080        if not param:
10081            param = self.get_param()
10082
10083        # Param export
10084        param_transcript_export = param.get("transcripts", {}).get("export", {})
10085
10086        # Output file
10087        transcripts_export_output = param_transcript_export.get("output", None)
10088
10089        if not param_transcript_export or not transcripts_export_output:
10090            log.warning(f"No transcriipts export parameters defined!")
10091            return False
10092
10093        # List of transcripts annotations
10094        query_describe = f"""
10095            SELECT column_name
10096            FROM (
10097                    DESCRIBE SELECT * FROM {transcripts_table}
10098                )
10099            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10100        """
10101        transcripts_annotations_list = list(
10102            self.get_query_to_df(query=query_describe)["column_name"]
10103        )
10104
10105        # Create transcripts table for export
10106        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10107            random.choices(string.ascii_uppercase + string.digits, k=10)
10108        )
10109        query_create_transcripts_table_export = f"""
10110            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10111        """
10112        self.execute_query(query=query_create_transcripts_table_export)
10113
10114        # Output file format
10115        transcripts_export_output_format = get_file_format(
10116            filename=transcripts_export_output
10117        )
10118
10119        # Format VCF - construct INFO
10120        if transcripts_export_output_format in ["vcf"]:
10121
10122            # Construct query update INFO and header
10123            query_update_info = []
10124            for field in transcripts_annotations_list:
10125
10126                # If field not in header
10127                if field not in self.get_header_infos_list():
10128
10129                    # Add PZ Transcript in header
10130                    self.get_header().infos[field] = vcf.parser._Info(
10131                        field,
10132                        ".",
10133                        "String",
10134                        f"Annotation '{field}' from transcript view",
10135                        "unknown",
10136                        "unknown",
10137                        0,
10138                    )
10139
10140                # Add field as INFO/tag
10141                query_update_info.append(
10142                    f"""
10143                        CASE
10144                            WHEN "{field}" IS NOT NULL
10145                            THEN concat('{field}=', "{field}", ';')    
10146                            ELSE ''     
10147                        END
10148                        """
10149                )
10150
10151            # Query param
10152            query_update_info_value = (
10153                f""" concat('',  {", ".join(query_update_info)}) """
10154            )
10155            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10156
10157        else:
10158
10159            # Query param
10160            query_update_info_value = f""" NULL """
10161            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10162
10163        # Update query INFO column
10164        query_update = f"""
10165            UPDATE {transcripts_table_export}
10166            SET INFO = {query_update_info_value}
10167
10168        """
10169        self.execute_query(query=query_update)
10170
10171        # Export
10172        self.export_output(
10173            output_file=transcripts_export_output,
10174            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10175        )
10176
10177        # Drop transcripts export table
10178        query_drop_transcripts_table_export = f"""
10179            DROP TABLE {transcripts_table_export}
10180        """
10181        self.execute_query(query=query_drop_transcripts_table_export)
10182
10183    def transcripts_prioritization(
10184        self, transcripts_table: str = None, param: dict = {}
10185    ) -> bool:
10186        """
10187        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10188        and updates the variants table with the prioritized information.
10189
10190        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10191        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10192        This parameter is used to identify the table where the transcripts data is stored for the
10193        prioritization process
10194        :type transcripts_table: str
10195        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10196        that contains various configuration settings for the prioritization process of transcripts. It
10197        is used to customize the behavior of the prioritization algorithm and includes settings such as
10198        the prefix for prioritization fields, default profiles, and other
10199        :type param: dict
10200        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10201        transcripts prioritization process is successfully completed, and `False` if there are any
10202        issues or if no profile is defined for transcripts prioritization.
10203        """
10204
10205        log.debug("Start transcripts prioritization...")
10206
10207        # Param
10208        if not param:
10209            param = self.get_param()
10210
10211        # Variants table
10212        table_variants = self.get_table_variants()
10213
10214        # Transcripts table
10215        if transcripts_table is None:
10216            transcripts_table = self.create_transcript_view(
10217                transcripts_table="transcripts", param=param
10218            )
10219        if transcripts_table is None:
10220            msg_err = "No Transcripts table availalble"
10221            log.error(msg_err)
10222            raise ValueError(msg_err)
10223        log.debug(f"transcripts_table={transcripts_table}")
10224
10225        # Get transcripts columns
10226        columns_as_list_query = f"""
10227            DESCRIBE {transcripts_table}
10228        """
10229        columns_as_list = list(
10230            self.get_query_to_df(columns_as_list_query)["column_name"]
10231        )
10232
10233        # Create INFO if not exists
10234        if "INFO" not in columns_as_list:
10235            query_add_info = f"""
10236                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10237            """
10238            self.execute_query(query_add_info)
10239
10240        # Prioritization param and Force only PZ Score and Flag
10241        pz_param = param.get("transcripts", {}).get("prioritization", {})
10242
10243        # PZ profile by default
10244        pz_profile_default = (
10245            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10246        )
10247
10248        # Exit if no profile
10249        if pz_profile_default is None:
10250            log.warning("No profile defined for transcripts prioritization")
10251            return False
10252
10253        # PZ fields
10254        pz_param_pzfields = {}
10255
10256        # PZ field transcripts
10257        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10258
10259        # Add PZ Transcript in header
10260        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10261            pz_fields_transcripts,
10262            ".",
10263            "String",
10264            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10265            "unknown",
10266            "unknown",
10267            code_type_map["String"],
10268        )
10269
10270        # Mandatory fields
10271        pz_mandatory_fields_list = [
10272            "Score",
10273            "Flag",
10274            "Tags",
10275            "Comment",
10276            "Infos",
10277            "Class",
10278        ]
10279        pz_mandatory_fields = []
10280        for pz_mandatory_field in pz_mandatory_fields_list:
10281            pz_mandatory_fields.append(
10282                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10283            )
10284
10285        # PZ fields in param
10286        for pz_field in pz_param.get("pzfields", []):
10287            if pz_field in pz_mandatory_fields_list:
10288                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10289                    pz_param.get("pzprefix", "PTZ") + pz_field
10290                )
10291            else:
10292                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10293                pz_param_pzfields[pz_field] = pz_field_new
10294
10295                # Add PZ Transcript in header
10296                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10297                    pz_field_new,
10298                    ".",
10299                    "String",
10300                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10301                    "unknown",
10302                    "unknown",
10303                    code_type_map["String"],
10304                )
10305
10306        # PZ fields param
10307        pz_param["pzfields"] = pz_mandatory_fields
10308
10309        # Prioritization
10310        prioritization_result = self.prioritization(
10311            table=transcripts_table,
10312            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10313        )
10314        if not prioritization_result:
10315            log.warning("Transcripts prioritization not processed")
10316            return False
10317
10318        # PZ fields sql query
10319        query_update_select_list = []
10320        query_update_concat_list = []
10321        query_update_order_list = []
10322        for pz_param_pzfield in set(
10323            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10324        ):
10325            query_update_select_list.append(f" {pz_param_pzfield}, ")
10326
10327        for pz_param_pzfield in pz_param_pzfields:
10328            query_update_concat_list.append(
10329                f"""
10330                    , CASE 
10331                        WHEN {pz_param_pzfield} IS NOT NULL
10332                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10333                        ELSE ''
10334                    END
10335                """
10336            )
10337
10338        # Order by
10339        pz_orders = (
10340            param.get("transcripts", {})
10341            .get("prioritization", {})
10342            .get("prioritization_transcripts_order", {})
10343        )
10344        if not pz_orders:
10345            pz_orders = {
10346                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10347                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10348            }
10349        for pz_order in pz_orders:
10350            query_update_order_list.append(
10351                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10352            )
10353
10354        # Fields to explode
10355        fields_to_explode = (
10356            list(pz_param_pzfields.keys())
10357            + pz_mandatory_fields
10358            + list(pz_orders.keys())
10359        )
10360        # Remove transcript column as a specific transcript column
10361        if "transcript" in fields_to_explode:
10362            fields_to_explode.remove("transcript")
10363
10364        # Fields intranscripts table
10365        query_transcripts_table = f"""
10366            DESCRIBE SELECT * FROM {transcripts_table}
10367        """
10368        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10369
10370        # Check fields to explode
10371        for field_to_explode in fields_to_explode:
10372            if field_to_explode not in self.get_header_infos_list() + list(
10373                query_transcripts_table.column_name
10374            ):
10375                msg_err = f"INFO/{field_to_explode} NOT IN header"
10376                log.error(msg_err)
10377                raise ValueError(msg_err)
10378
10379        # Explode fields to explode
10380        self.explode_infos(
10381            table=transcripts_table,
10382            fields=fields_to_explode,
10383        )
10384
10385        # Transcript preference file
10386        transcripts_preference_file = (
10387            param.get("transcripts", {})
10388            .get("prioritization", {})
10389            .get("prioritization_transcripts", {})
10390        )
10391        transcripts_preference_file = full_path(transcripts_preference_file)
10392
10393        # Transcript preference forced
10394        transcript_preference_force = (
10395            param.get("transcripts", {})
10396            .get("prioritization", {})
10397            .get("prioritization_transcripts_force", False)
10398        )
10399        # Transcript version forced
10400        transcript_version_force = (
10401            param.get("transcripts", {})
10402            .get("prioritization", {})
10403            .get("prioritization_transcripts_version_force", False)
10404        )
10405
10406        # Transcripts Ranking
10407        if transcripts_preference_file:
10408
10409            # Transcripts file to dataframe
10410            if os.path.exists(transcripts_preference_file):
10411                transcripts_preference_dataframe = transcripts_file_to_df(
10412                    transcripts_preference_file
10413                )
10414            else:
10415                log.error(
10416                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10417                )
10418                raise ValueError(
10419                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10420                )
10421
10422            # Order by depending to transcript preference forcing
10423            if transcript_preference_force:
10424                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10425            else:
10426                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10427
10428            # Transcript columns joined depend on version consideration
10429            if transcript_version_force:
10430                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10431            else:
10432                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10433
10434            # Query ranking for update
10435            query_update_ranking = f"""
10436                SELECT
10437                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10438                    ROW_NUMBER() OVER (
10439                        PARTITION BY "#CHROM", POS, REF, ALT
10440                        ORDER BY {order_by}
10441                    ) AS rn
10442                FROM {transcripts_table}
10443                LEFT JOIN 
10444                    (
10445                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10446                        FROM transcripts_preference_dataframe
10447                    ) AS transcripts_preference
10448                ON {transcripts_version_join}
10449            """
10450
10451        else:
10452
10453            # Query ranking for update
10454            query_update_ranking = f"""
10455                SELECT
10456                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10457                    ROW_NUMBER() OVER (
10458                        PARTITION BY "#CHROM", POS, REF, ALT
10459                        ORDER BY {" , ".join(query_update_order_list)}
10460                    ) AS rn
10461                FROM {transcripts_table}
10462            """
10463
10464        # Export Transcripts prioritization infos to variants table
10465        query_update = f"""
10466            WITH RankedTranscripts AS (
10467                {query_update_ranking}
10468            )
10469            UPDATE {table_variants}
10470                SET
10471                INFO = CONCAT(CASE
10472                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10473                            THEN ''
10474                            ELSE concat("INFO", ';')
10475                        END,
10476                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10477                        )
10478            FROM
10479                RankedTranscripts
10480            WHERE
10481                rn = 1
10482                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10483                AND variants."POS" = RankedTranscripts."POS"
10484                AND variants."REF" = RankedTranscripts."REF"
10485                AND variants."ALT" = RankedTranscripts."ALT"     
10486        """
10487
10488        # log.debug(f"query_update={query_update}")
10489        self.execute_query(query=query_update)
10490
10491        # Return
10492        return True
10493
10494    def create_transcript_view_from_columns_map(
10495        self,
10496        transcripts_table: str = "transcripts",
10497        columns_maps: dict = {},
10498        added_columns: list = [],
10499        temporary_tables: list = None,
10500        annotation_fields: list = None,
10501        column_rename: dict = {},
10502        column_clean: bool = False,
10503        column_case: str = None,
10504    ) -> tuple[list, list, list]:
10505        """
10506        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10507        specified columns mapping for transcripts data.
10508
10509        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10510        of the table where the transcripts data is stored or will be stored in the database. This table
10511        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10512        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10513        :type transcripts_table: str (optional)
10514        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10515        about how to map columns from a transcripts table to create a view. Each entry in the
10516        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10517        typically includes details such as the main transcript column and additional information columns
10518        :type columns_maps: dict
10519        :param added_columns: The `added_columns` parameter in the
10520        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10521        that will be added to the view being created based on the columns map provided. These columns
10522        are generated by exploding the transcript information columns along with the main transcript
10523        column
10524        :type added_columns: list
10525        :param temporary_tables: The `temporary_tables` parameter in the
10526        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10527        tables created during the process of creating a transcript view from a columns map. These
10528        temporary tables are used to store intermediate results or transformations before the final view
10529        is generated
10530        :type temporary_tables: list
10531        :param annotation_fields: The `annotation_fields` parameter in the
10532        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10533        used for annotation in the query view creation process. These fields are extracted from the
10534        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10535        :type annotation_fields: list
10536        :param column_rename: The `column_rename` parameter in the
10537        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10538        custom renaming for columns during the creation of the temporary table view. This parameter
10539        provides a mapping of original column names to the desired renamed column names. By using this
10540        parameter,
10541        :type column_rename: dict
10542        :param column_clean: The `column_clean` parameter in the
10543        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10544        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10545        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10546        False
10547        :type column_clean: bool (optional)
10548        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10549        function is used to specify the case transformation to be applied to the columns during the view
10550        creation process. It allows you to control whether the column values should be converted to
10551        lowercase, uppercase, or remain unchanged
10552        :type column_case: str
10553        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10554        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10555        """
10556
10557        log.debug("Start transcrpts view creation from columns map...")
10558
10559        # "from_columns_map": [
10560        #     {
10561        #         "transcripts_column": "Ensembl_transcriptid",
10562        #         "transcripts_infos_columns": [
10563        #             "genename",
10564        #             "Ensembl_geneid",
10565        #             "LIST_S2_score",
10566        #             "LIST_S2_pred",
10567        #         ],
10568        #     },
10569        #     {
10570        #         "transcripts_column": "Ensembl_transcriptid",
10571        #         "transcripts_infos_columns": [
10572        #             "genename",
10573        #             "VARITY_R_score",
10574        #             "Aloft_pred",
10575        #         ],
10576        #     },
10577        # ],
10578
10579        # Init
10580        if temporary_tables is None:
10581            temporary_tables = []
10582        if annotation_fields is None:
10583            annotation_fields = []
10584
10585        # Variants table
10586        table_variants = self.get_table_variants()
10587
10588        for columns_map in columns_maps:
10589
10590            # Transcript column
10591            transcripts_column = columns_map.get("transcripts_column", None)
10592
10593            # Transcripts infos columns
10594            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10595
10596            # Transcripts infos columns rename
10597            column_rename = columns_map.get("column_rename", column_rename)
10598
10599            # Transcripts infos columns clean
10600            column_clean = columns_map.get("column_clean", column_clean)
10601
10602            # Transcripts infos columns case
10603            column_case = columns_map.get("column_case", column_case)
10604
10605            if transcripts_column is not None:
10606
10607                # Explode
10608                added_columns += self.explode_infos(
10609                    fields=[transcripts_column] + transcripts_infos_columns
10610                )
10611
10612                # View clauses
10613                clause_select_variants = []
10614                clause_select_tanscripts = []
10615                for field in [transcripts_column] + transcripts_infos_columns:
10616
10617                    # AS field
10618                    as_field = field
10619
10620                    # Rename
10621                    if column_rename:
10622                        as_field = column_rename.get(as_field, as_field)
10623
10624                    # Clean
10625                    if column_clean:
10626                        as_field = clean_annotation_field(as_field)
10627
10628                    # Case
10629                    if column_case:
10630                        if column_case.lower() in ["lower"]:
10631                            as_field = as_field.lower()
10632                        elif column_case.lower() in ["upper"]:
10633                            as_field = as_field.upper()
10634
10635                    # Clause select Variants
10636                    clause_select_variants.append(
10637                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10638                    )
10639
10640                    if field in [transcripts_column]:
10641                        clause_select_tanscripts.append(
10642                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10643                        )
10644                    else:
10645                        clause_select_tanscripts.append(
10646                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10647                        )
10648                        annotation_fields.append(as_field)
10649
10650                # Querey View
10651                query = f""" 
10652                    SELECT
10653                        "#CHROM", POS, REF, ALT, INFO,
10654                        "{transcripts_column}" AS 'transcript',
10655                        {", ".join(clause_select_tanscripts)}
10656                    FROM (
10657                        SELECT 
10658                            "#CHROM", POS, REF, ALT, INFO,
10659                            {", ".join(clause_select_variants)}
10660                        FROM {table_variants}
10661                        )
10662                    WHERE "{transcripts_column}" IS NOT NULL
10663                """
10664
10665                # Create temporary table
10666                temporary_table = transcripts_table + "".join(
10667                    random.choices(string.ascii_uppercase + string.digits, k=10)
10668                )
10669
10670                # Temporary_tables
10671                temporary_tables.append(temporary_table)
10672                query_view = f"""
10673                    CREATE TEMPORARY TABLE {temporary_table}
10674                    AS ({query})
10675                """
10676                self.execute_query(query=query_view)
10677
10678        return added_columns, temporary_tables, annotation_fields
10679
10680    def create_transcript_view_from_column_format(
10681        self,
10682        transcripts_table: str = "transcripts",
10683        column_formats: dict = {},
10684        temporary_tables: list = None,
10685        annotation_fields: list = None,
10686        column_rename: dict = {},
10687        column_clean: bool = False,
10688        column_case: str = None,
10689    ) -> tuple[list, list, list]:
10690        """
10691        The `create_transcript_view_from_column_format` function generates a transcript view based on
10692        specified column formats, adds additional columns and annotation fields, and returns the list of
10693        temporary tables and annotation fields.
10694
10695        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10696        of the table containing the transcripts data. This table will be used as the base table for
10697        creating the transcript view. The default value for this parameter is "transcripts", but you can
10698        provide a different table name if needed, defaults to transcripts
10699        :type transcripts_table: str (optional)
10700        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10701        about the columns to be used for creating the transcript view. Each entry in the dictionary
10702        specifies the mapping between a transcripts column and a transcripts infos column. This
10703        parameter allows you to define how the columns from the transcripts table should be transformed
10704        or mapped
10705        :type column_formats: dict
10706        :param temporary_tables: The `temporary_tables` parameter in the
10707        `create_transcript_view_from_column_format` function is a list that stores the names of
10708        temporary views created during the process of creating a transcript view from a column format.
10709        These temporary views are used to manipulate and extract data before generating the final
10710        transcript view
10711        :type temporary_tables: list
10712        :param annotation_fields: The `annotation_fields` parameter in the
10713        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10714        that are extracted from the temporary views created during the process. These annotation fields
10715        are obtained by querying the temporary views and extracting the column names excluding specific
10716        columns like `#CH
10717        :type annotation_fields: list
10718        :param column_rename: The `column_rename` parameter in the
10719        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10720        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10721        column names to new column names in this dictionary, you can rename specific columns during the
10722        process
10723        :type column_rename: dict
10724        :param column_clean: The `column_clean` parameter in the
10725        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10726        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10727        will be cleaned during the creation of the transcript view based on the specified column format,
10728        defaults to False
10729        :type column_clean: bool (optional)
10730        :param column_case: The `column_case` parameter in the
10731        `create_transcript_view_from_column_format` function is used to specify the case transformation
10732        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10733        to convert the column names to uppercase or lowercase, respectively
10734        :type column_case: str
10735        :return: The `create_transcript_view_from_column_format` function returns two lists:
10736        `temporary_tables` and `annotation_fields`.
10737        """
10738
10739        log.debug("Start transcrpts view creation from column format...")
10740
10741        #  "from_column_format": [
10742        #     {
10743        #         "transcripts_column": "ANN",
10744        #         "transcripts_infos_column": "Feature_ID",
10745        #     }
10746        # ],
10747
10748        # Init
10749        if temporary_tables is None:
10750            temporary_tables = []
10751        if annotation_fields is None:
10752            annotation_fields = []
10753
10754        for column_format in column_formats:
10755
10756            # annotation field and transcript annotation field
10757            annotation_field = column_format.get("transcripts_column", "ANN")
10758            transcript_annotation = column_format.get(
10759                "transcripts_infos_column", "Feature_ID"
10760            )
10761
10762            # Transcripts infos columns rename
10763            column_rename = column_format.get("column_rename", column_rename)
10764
10765            # Transcripts infos columns clean
10766            column_clean = column_format.get("column_clean", column_clean)
10767
10768            # Transcripts infos columns case
10769            column_case = column_format.get("column_case", column_case)
10770
10771            # Temporary View name
10772            temporary_view_name = transcripts_table + "".join(
10773                random.choices(string.ascii_uppercase + string.digits, k=10)
10774            )
10775
10776            # Create temporary view name
10777            temporary_view_name = self.annotation_format_to_table(
10778                uniquify=True,
10779                annotation_field=annotation_field,
10780                view_name=temporary_view_name,
10781                annotation_id=transcript_annotation,
10782                column_rename=column_rename,
10783                column_clean=column_clean,
10784                column_case=column_case,
10785            )
10786
10787            # Annotation fields
10788            if temporary_view_name:
10789                query_annotation_fields = f"""
10790                    SELECT *
10791                    FROM (
10792                        DESCRIBE SELECT *
10793                        FROM {temporary_view_name}
10794                        )
10795                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10796                """
10797                df_annotation_fields = self.get_query_to_df(
10798                    query=query_annotation_fields
10799                )
10800
10801                # Add temporary view and annotation fields
10802                temporary_tables.append(temporary_view_name)
10803                annotation_fields += list(set(df_annotation_fields["column_name"]))
10804
10805        return temporary_tables, annotation_fields
10806
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        Build (or rebuild) the transcripts table by exploding per-transcript
        annotations out of the variants table and merging them per transcript.

        The layout of the source annotations is driven by
        ``param["transcripts"]["struct"]``, which may declare two extraction
        strategies: ``from_columns_map`` and ``from_column_format``. Each
        strategy produces temporary views that are merged (``UNION BY NAME``)
        and then aggregated per transcript. Optionally, transcript IDs can
        have their version suffix removed and/or be remapped through an alias
        file.

        :param transcripts_table: Name of the table that will hold the final
        transcript view. When None, taken from param["transcripts"]["table"],
        defaulting to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: When True, drop any existing table of
        that name before creating the view, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: Parameters dictionary; when empty, the object's stored
        parameters are used. Drives the struct, transcript version handling
        and transcript ID mapping behaviour
        :type param: dict
        :return: The name of the transcripts table created, or None when no
        struct is configured
        """

        log.debug("Start transcripts view creation...")

        # Default table name when neither the argument nor param provides one
        transcripts_table_default = "transcripts"

        # Fall back to the object's stored parameters
        if not param:
            param = self.get_param()

        # Structure describing where transcript annotations live in the variants table
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip the trailing ".N" from transcript IDs when True
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping: optional file of transcript/alias pairs
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: keep only transcripts listed in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Resolve the transcripts table name from param if not given
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table along the way (dropped at the end)
            added_columns = []

            # Temporary views created by the extraction helpers
            temporary_tables = []

            # Annotation field names collected from the temporary views
            annotation_fields = []

            # Extraction strategy 1: explicit columns map
            # NOTE(review): the helper receives these lists, appends to them in
            # place AND returns them, so the "+=" below duplicates every entry;
            # this is masked later by the list(set(...)) dedup — confirm intended
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Extraction strategy 2: packed annotation column format (e.g. snpEff "ANN")
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and drop key/reserved columns from the annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge all temporary views into one relation; UNION BY NAME aligns
            # columns across views with different column sets
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table starts the query
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Subsequent temporary tables are appended with UNION BY NAME
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested sub-queries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating annotations per transcript
            query_merge_on_transcripts_annotation_fields = []

            # Aggregate the distinct transcript IDs seen per group
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate each annotation field into a comma-separated list of distinct values
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcript ID mapping branch
            if transcript_id_mapping_file:

                # Load the mapping file as a DataFrame; DuckDB resolves it in the
                # SQL below by the name "transcript_id_mapping_dataframe", which
                # must match this local variable's name
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Compare transcript IDs with or without their version suffix
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key for the second merge: prefer the mapped ID, else the original
                query_transcript_merge_group_by = """
                        CASE
                            WHEN transcript_mapped NOT IN ('')
                            THEN split_part(transcript_mapped, '.', 1)
                            ELSE split_part(transcript_original, '.', 1)
                        END
                    """

                # First merge: aggregate per (variant, original/mapped transcript pair)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve the column names produced by the first merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Build the select clause of the second merge, re-aggregating
                # every non-key column per mapped transcript ID
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            # NOTE(review): original IDs are re-exposed under the
                            # plural alias "transcripts_mapped" — confirm this
                            # column name is intended
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Second merge: collapse rows sharing the same mapped transcript ID
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Optionally keep only transcripts present in the mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Transcript ID expression, with or without its version suffix
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # NOTE(review): these two values are not used below in this
                # branch (the final query inlines query_transcript_column) —
                # possibly leftover
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Single merge: aggregate annotations per (variant, transcript)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop the transcript view first if requested
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Materialize the merged result as the transcripts table
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Clean up columns added to the variants table during extraction
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # Nothing to build without a struct definition
            transcripts_table = None

        return transcripts_table
11090
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a packed annotation INFO field (e.g. snpEff "ANN") into a
        temporary table with one typed column per annotation sub-field.

        The sub-field names are parsed from the quoted, " | "-separated list
        in the INFO field's header description. Each variant's annotation
        string is exploded to JSON, the JSON keys are discovered, their SQL
        types are inferred from the data, and a temporary table is created
        exposing the ``annotation_id`` sub-field as the "transcript" column.

        :param uniquify: Whether to keep only distinct values when exploding
        the annotation string, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the packed annotation,
        defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript
        identifier, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
        "transcripts"
        :type view_name: str (optional)
        :param column_rename: Mapping of original to new column names applied
        to the exploded sub-fields (and to annotation_id)
        :type column_rename: dict
        :param column_clean: When True, clean the exploded column names (and
        annotation_id) via clean_annotation_field, defaults to False
        :type column_clean: bool (optional)
        :param column_case: Optional case transformation for column names,
        "upper" or "lower"
        :type column_case: str
        :return: The name of the temporary table created, or None when the
        annotation field is absent from the VCF header
        """

        # Name of the JSON-exploded helper column
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript identifier so it matches the
        # transformed column names used in the final SELECT
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is overridden with "INFO/" —
        # confirm that custom prefixes are meant to be ignored here
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation field and its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field's description)
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped before returning)
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Parse the sub-field names from the quoted part of the INFO
            # description (e.g. "... 'Allele | Annotation | ...'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the column name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Ensure a variant ID column exists
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull the variants and their packed annotation into a DataFrame;
            # DuckDB resolves "dataframe_annotation_format" in the SQL below by
            # this local variable's name, so it must not be renamed
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each packed annotation string into a JSON document
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys present in the exploded annotations
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key: rename/clean/case it, infer its SQL type from the
            # data, and build the corresponding select expression
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original key and its transformed name
                key = row.iloc[0]
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to sample them for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Infer the SQL type from the non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Select expression: extract, NULLIF empty, cast to detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing annotation_id as "transcript"
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to build
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
11290
11291    def transcript_view_to_variants(
11292        self,
11293        transcripts_table: str = None,
11294        transcripts_column_id: str = None,
11295        transcripts_info_json: str = None,
11296        transcripts_info_field_json: str = None,
11297        transcripts_info_format: str = None,
11298        transcripts_info_field_format: str = None,
11299        param: dict = {},
11300    ) -> bool:
11301        """
11302        The `transcript_view_to_variants` function updates a variants table with information from
11303        transcripts in JSON format.
11304
11305        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11306        table containing the transcripts data. If this parameter is not provided, the function will
11307        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11308        :type transcripts_table: str
11309        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11310        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11311        identifier is used to match transcripts with variants in the database
11312        :type transcripts_column_id: str
11313        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11314        of the column in the variants table where the transcripts information will be stored in JSON
11315        format. This parameter allows you to define the column in the variants table that will hold the
11316        JSON-formatted information about transcripts
11317        :type transcripts_info_json: str
11318        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11319        specify the field in the VCF header that will contain information about transcripts in JSON
11320        format. This field will be added to the VCF header as an INFO field with the specified name
11321        :type transcripts_info_field_json: str
11322        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11323        format of the information about transcripts that will be stored in the variants table. This
11324        format can be used to define how the transcript information will be structured or displayed
11325        within the variants table
11326        :type transcripts_info_format: str
11327        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11328        specify the field in the VCF header that will contain information about transcripts in a
11329        specific format. This field will be added to the VCF header as an INFO field with the specified
11330        name
11331        :type transcripts_info_field_format: str
11332        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11333        that contains various configuration settings related to transcripts. It is used to provide
11334        default values for certain parameters if they are not explicitly provided when calling the
11335        method. The `param` dictionary can be passed as an argument
11336        :type param: dict
11337        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11338        if the operation is successful and `False` if certain conditions are not met.
11339        """
11340
11341        msg_info_prefix = "Start transcripts view to variants annotations"
11342
11343        log.debug(f"{msg_info_prefix}...")
11344
11345        # Default
11346        transcripts_table_default = "transcripts"
11347        transcripts_column_id_default = "transcript"
11348        transcripts_info_json_default = None
11349        transcripts_info_format_default = None
11350        transcripts_info_field_json_default = None
11351        transcripts_info_field_format_default = None
11352
11353        # Param
11354        if not param:
11355            param = self.get_param()
11356
11357        # Transcripts table
11358        if transcripts_table is None:
11359            transcripts_table = param.get("transcripts", {}).get(
11360                "table", transcripts_table_default
11361            )
11362
11363        # Transcripts column ID
11364        if transcripts_column_id is None:
11365            transcripts_column_id = param.get("transcripts", {}).get(
11366                "column_id", transcripts_column_id_default
11367            )
11368
11369        # Transcripts info json
11370        if transcripts_info_json is None:
11371            transcripts_info_json = param.get("transcripts", {}).get(
11372                "transcripts_info_json", transcripts_info_json_default
11373            )
11374
11375        # Transcripts info field JSON
11376        if transcripts_info_field_json is None:
11377            transcripts_info_field_json = param.get("transcripts", {}).get(
11378                "transcripts_info_field_json", transcripts_info_field_json_default
11379            )
11380        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11381        #     transcripts_info_json = transcripts_info_field_json
11382
11383        # Transcripts info format
11384        if transcripts_info_format is None:
11385            transcripts_info_format = param.get("transcripts", {}).get(
11386                "transcripts_info_format", transcripts_info_format_default
11387            )
11388
11389        # Transcripts info field FORMAT
11390        if transcripts_info_field_format is None:
11391            transcripts_info_field_format = param.get("transcripts", {}).get(
11392                "transcripts_info_field_format", transcripts_info_field_format_default
11393            )
11394        # if (
11395        #     transcripts_info_field_format is not None
11396        #     and transcripts_info_format is None
11397        # ):
11398        #     transcripts_info_format = transcripts_info_field_format
11399
11400        # Variants table
11401        table_variants = self.get_table_variants()
11402
11403        # Check info columns param
11404        if (
11405            transcripts_info_json is None
11406            and transcripts_info_field_json is None
11407            and transcripts_info_format is None
11408            and transcripts_info_field_format is None
11409        ):
11410            return False
11411
11412        # Transcripts infos columns
11413        query_transcripts_infos_columns = f"""
11414            SELECT *
11415            FROM (
11416                DESCRIBE SELECT * FROM {transcripts_table}
11417                )
11418            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11419        """
11420        transcripts_infos_columns = list(
11421            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11422        )
11423
11424        # View results
11425        clause_select = []
11426        clause_to_json = []
11427        clause_to_format = []
11428        for field in transcripts_infos_columns:
11429            # Do not consider INFO field for export into fields
11430            if field not in ["INFO"]:
11431                clause_select.append(
11432                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11433                )
11434                clause_to_json.append(f""" '{field}': "{field}" """)
11435                clause_to_format.append(f""" "{field}" """)
11436
11437        # Update
11438        update_set_json = []
11439        update_set_format = []
11440
11441        # VCF header
11442        vcf_reader = self.get_header()
11443
11444        # Transcripts to info column in JSON
11445        if transcripts_info_json:
11446
11447            # Create column on variants table
11448            self.add_column(
11449                table_name=table_variants,
11450                column_name=transcripts_info_json,
11451                column_type="JSON",
11452                default_value=None,
11453                drop=False,
11454            )
11455
11456            # Add header
11457            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11458                transcripts_info_json,
11459                ".",
11460                "String",
11461                "Transcripts in JSON format",
11462                "unknwon",
11463                "unknwon",
11464                self.code_type_map["String"],
11465            )
11466
11467            # Add to update
11468            update_set_json.append(
11469                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11470            )
11471
11472        # Transcripts to info field in JSON
11473        if transcripts_info_field_json:
11474
11475            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11476
11477            # Add to update
11478            update_set_json.append(
11479                f""" 
11480                    INFO = concat(
11481                            CASE
11482                                WHEN INFO NOT IN ('', '.')
11483                                THEN INFO
11484                                ELSE ''
11485                            END,
11486                            CASE
11487                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11488                                THEN concat(
11489                                    ';{transcripts_info_field_json}=',
11490                                    t.{transcripts_info_json}
11491                                )
11492                                ELSE ''
11493                            END
11494                            )
11495                """
11496            )
11497
11498            # Add header
11499            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11500                transcripts_info_field_json,
11501                ".",
11502                "String",
11503                "Transcripts in JSON format",
11504                "unknwon",
11505                "unknwon",
11506                self.code_type_map["String"],
11507            )
11508
11509        if update_set_json:
11510
11511            # Update query
11512            query_update = f"""
11513                UPDATE {table_variants}
11514                    SET {", ".join(update_set_json)}
11515                FROM
11516                (
11517                    SELECT
11518                        "#CHROM", POS, REF, ALT,
11519                            concat(
11520                            '{{',
11521                            string_agg(
11522                                '"' || "{transcripts_column_id}" || '":' ||
11523                                to_json(json_output)
11524                            ),
11525                            '}}'
11526                            )::JSON AS {transcripts_info_json}
11527                    FROM
11528                        (
11529                        SELECT
11530                            "#CHROM", POS, REF, ALT,
11531                            "{transcripts_column_id}",
11532                            to_json(
11533                                {{{",".join(clause_to_json)}}}
11534                            )::JSON AS json_output
11535                        FROM
11536                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11537                        WHERE "{transcripts_column_id}" IS NOT NULL
11538                        )
11539                    GROUP BY "#CHROM", POS, REF, ALT
11540                ) AS t
11541                WHERE {table_variants}."#CHROM" = t."#CHROM"
11542                    AND {table_variants}."POS" = t."POS"
11543                    AND {table_variants}."REF" = t."REF"
11544                    AND {table_variants}."ALT" = t."ALT"
11545            """
11546
11547            self.execute_query(query=query_update)
11548
11549        # Transcripts to info column in FORMAT
11550        if transcripts_info_format:
11551
11552            # Create column on variants table
11553            self.add_column(
11554                table_name=table_variants,
11555                column_name=transcripts_info_format,
11556                column_type="VARCHAR",
11557                default_value=None,
11558                drop=False,
11559            )
11560
11561            # Add header
11562            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11563                transcripts_info_format,
11564                ".",
11565                "String",
11566                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11567                "unknwon",
11568                "unknwon",
11569                self.code_type_map["String"],
11570            )
11571
11572            # Add to update
11573            update_set_format.append(
11574                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11575            )
11576
11577        else:
11578
11579            # Set variable for internal queries
11580            transcripts_info_format = "transcripts_info_format"
11581
11582        # Transcripts to info field in JSON
11583        if transcripts_info_field_format:
11584
11585            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11586
11587            # Add to update
11588            update_set_format.append(
11589                f""" 
11590                    INFO = concat(
11591                            CASE
11592                                WHEN INFO NOT IN ('', '.')
11593                                THEN INFO
11594                                ELSE ''
11595                            END,
11596                            CASE
11597                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11598                                THEN concat(
11599                                    ';{transcripts_info_field_format}=',
11600                                    t.{transcripts_info_format}
11601                                )
11602                                ELSE ''
11603                            END
11604                            )
11605                """
11606            )
11607
11608            # Add header
11609            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11610                transcripts_info_field_format,
11611                ".",
11612                "String",
11613                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11614                "unknwon",
11615                "unknwon",
11616                self.code_type_map["String"],
11617            )
11618
11619        if update_set_format:
11620
11621            # Update query
11622            query_update = f"""
11623                UPDATE {table_variants}
11624                    SET {", ".join(update_set_format)}
11625                FROM
11626                (
11627                    SELECT
11628                        "#CHROM", POS, REF, ALT,
11629                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11630                    FROM 
11631                        (
11632                        SELECT
11633                            "#CHROM", POS, REF, ALT,
11634                            "{transcripts_column_id}",
11635                            concat(
11636                                "{transcripts_column_id}",
11637                                '|',
11638                                {", '|', ".join(clause_to_format)}
11639                            ) AS {transcripts_info_format}
11640                        FROM
11641                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11642                        )
11643                    GROUP BY "#CHROM", POS, REF, ALT
11644                ) AS t
11645                WHERE {table_variants}."#CHROM" = t."#CHROM"
11646                    AND {table_variants}."POS" = t."POS"
11647                    AND {table_variants}."REF" = t."REF"
11648                    AND {table_variants}."ALT" = t."ALT"
11649            """
11650
11651            self.execute_query(query=query_update)
11652
11653        return True
11654
11655    def rename_info_fields(
11656        self, fields_to_rename: dict = None, table: str = None
11657    ) -> dict:
11658        """
11659        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11660        corresponding INFO fields in the variants table.
11661
11662        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11663        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11664        represent the original field names that need to be renamed, and the corresponding values
11665        represent the new names to which the fields should be
11666        :type fields_to_rename: dict
11667        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11668        the table in which the variants data is stored. This table contains information about genetic
11669        variants, and the function updates the corresponding INFO fields in this table when renaming
11670        specified fields in the VCF file header
11671        :type table: str
11672        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11673        the original field names as keys and their corresponding new names (or None if the field was
11674        removed) as values after renaming or removing specified fields in a VCF file header and updating
11675        corresponding INFO fields in the variants table.
11676        """
11677
11678        # Init
11679        fields_renamed = {}
11680        config = self.get_config()
11681        access = config.get("access")
11682
11683        if table is None:
11684            table = self.get_table_variants()
11685
11686        # regexp replace fonction
11687        regex_replace_dict = {}
11688        regex_replace_nb = 0
11689        regex_replace_partition = 125
11690        regex_replace = "INFO"
11691
11692        if fields_to_rename is not None and access not in ["RO"]:
11693
11694            log.info("Rename or remove fields...")
11695
11696            # Header
11697            header = self.get_header()
11698
11699            for field_to_rename, field_renamed in fields_to_rename.items():
11700
11701                if field_to_rename in header.infos:
11702
11703                    # Rename header
11704                    if field_renamed is not None:
11705                        header.infos[field_renamed] = vcf.parser._Info(
11706                            field_renamed,
11707                            header.infos[field_to_rename].num,
11708                            header.infos[field_to_rename].type,
11709                            header.infos[field_to_rename].desc,
11710                            header.infos[field_to_rename].source,
11711                            header.infos[field_to_rename].version,
11712                            header.infos[field_to_rename].type_code,
11713                        )
11714                    del header.infos[field_to_rename]
11715
11716                    # Rename INFO patterns
11717                    field_pattern = rf'(^|;)({field_to_rename})($|;|=[^;]*)'
11718                    if field_renamed is not None:
11719                        field_renamed_pattern = rf'\1{field_renamed}\3'
11720                    else:
11721                        field_renamed_pattern = ''
11722
11723                    # regexp replace
11724                    regex_replace_nb += 1
11725                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
11726                    if (regex_replace_nb % regex_replace_partition) == 0:
11727                        regex_replace = "INFO"
11728                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
11729                    regex_replace_dict[regex_replace_key] = regex_replace
11730
11731                    # Return
11732                    fields_renamed[field_to_rename] = field_renamed
11733
11734                    # Log
11735                    if field_renamed is not None:
11736                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
11737                    else:
11738                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")
11739
11740                else:
11741
11742                    log.warning(f"Rename or remove fields - field '{field_to_rename}' not in header")
11743
11744
11745            # Rename INFO
11746            for regex_replace_key, regex_replace  in regex_replace_dict.items():
11747                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
11748                query = f"""
11749                    UPDATE {table}
11750                    SET
11751                        INFO = {regex_replace}
11752                """
11753                log.debug(f"query={query}")
11754                self.execute_query(query=query)
11755
11756        return fields_renamed
11757
11758    def calculation_rename_info_fields(
11759        self,
11760        fields_to_rename: dict = None,
11761        table: str = None,
11762        operation_name: str = "RENAME_INFO_FIELDS",
11763    ) -> None:
11764        """
11765        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11766        fields to rename and table if provided, and then calls another function to rename the fields.
11767
11768        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11769        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11770        the key and the new field name as the value
11771        :type fields_to_rename: dict
11772        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11773        specify the name of the table for which the fields are to be renamed. It is a string type
11774        parameter
11775        :type table: str
11776        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11777        method is a string that specifies the name of the operation being performed. In this context, it
11778        is used as a default value for the operation name if not explicitly provided when calling the
11779        function, defaults to RENAME_INFO_FIELDS
11780        :type operation_name: str (optional)
11781        """
11782
11783        # Param
11784        param = self.get_param()
11785
11786        # Get param fields to rename
11787        param_fields_to_rename = (
11788            param.get("calculation", {})
11789            .get("calculations", {})
11790            .get(operation_name, {})
11791            .get("fields_to_rename", None)
11792        )
11793
11794        # Get param table
11795        param_table = (
11796            param.get("calculation", {})
11797            .get("calculations", {})
11798            .get(operation_name, {})
11799            .get("table", None)
11800        )
11801
11802        # Init fields_to_rename
11803        if fields_to_rename is None:
11804            fields_to_rename = param_fields_to_rename
11805
11806        # Init table
11807        if table is None:
11808            table = param_table
11809
11810        renamed_fields = self.rename_info_fields(
11811            fields_to_rename=fields_to_rename, table=table
11812        )
11813
11814        log.debug(f"renamed_fields:{renamed_fields}")
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
39    def __init__(
40        self,
41        conn=None,
42        input: str = None,
43        output: str = None,
44        config: dict = {},
45        param: dict = {},
46        load: bool = False,
47    ) -> None:
48        """
49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
50        header
51
52        :param conn: the connection to the database
53        :param input: the input file
54        :param output: the output file
55        :param config: a dictionary containing the configuration of the model
56        :param param: a dictionary containing the parameters of the model
57        """
58
59        # Init variables
60        self.init_variables()
61
62        # Input
63        self.set_input(input)
64
65        # Config
66        self.set_config(config)
67
68        # Param
69        self.set_param(param)
70
71        # Output
72        self.set_output(output)
73
74        # connexion
75        self.set_connexion(conn)
76
77        # Header
78        self.set_header()
79
80        # Samples
81        self.set_samples()
82
83        # Load data
84        if load:
85            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 87    def set_samples(self, samples: list = None) -> list:
 88        """
 89        The function `set_samples` sets the samples attribute of an object to a provided list or
 90        retrieves it from a parameter dictionary.
 91
 92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 93        input and sets the `samples` attribute of the class to the provided list. If no samples are
 94        provided, it tries to get the samples from the class's parameters using the `get_param` method
 95        :type samples: list
 96        :return: The `samples` list is being returned.
 97        """
 98
 99        if not samples:
100            samples = self.get_param().get("samples", {}).get("list", None)
101
102        self.samples = samples
103
104        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
106    def get_samples(self) -> list:
107        """
108        This function returns a list of samples.
109        :return: The `get_samples` method is returning the `samples` attribute of the object.
110        """
111
112        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
114    def get_samples_check(self) -> bool:
115        """
116        This function returns the value of the "check" key within the "samples" dictionary retrieved
117        from the parameters.
118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
120        method. If the key "check" is not found, it will return `False`.
121        """
122
123        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return False.

def set_input(self, input: str = None) -> None:
125    def set_input(self, input: str = None) -> None:
126        """
127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
128        attributes in the class accordingly.
129
130        :param input: The `set_input` method in the provided code snippet is used to set attributes
131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
132        :type input: str
133        """
134
135        if input and not isinstance(input, str):
136            try:
137                self.input = input.name
138            except:
139                log.error(f"Input file '{input} in bad format")
140                raise ValueError(f"Input file '{input} in bad format")
141        else:
142            self.input = input
143
144        # Input format
145        if input:
146            input_name, input_extension = os.path.splitext(self.input)
147            self.input_name = input_name
148            self.input_extension = input_extension
149            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, given either as a path string or as a file-like object exposing a `name` attribute; the file's name, extension and format are derived from it and stored as attributes of the class
def set_config(self, config: dict) -> None:
151    def set_config(self, config: dict) -> None:
152        """
153        The set_config function takes a config object and assigns it as the configuration object for the
154        class.
155
156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
157        contains configuration settings for the class. When you call the `set_config` function with a
158        dictionary object as the argument, it will set that dictionary as the configuration object for
159        the class
160        :type config: dict
161        """
162
163        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
165    def set_param(self, param: dict) -> None:
166        """
167        This function sets a parameter object for the class based on the input dictionary.
168
169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
170        as the `param` attribute of the class instance
171        :type param: dict
172        """
173
174        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: a dictionary of parameters that is stored as the `param` attribute of the class instance
def init_variables(self) -> None:
176    def init_variables(self) -> None:
177        """
178        This function initializes the variables that will be used in the rest of the class
179        """
180
181        self.prefix = "howard"
182        self.table_variants = "variants"
183        self.dataframe = None
184
185        self.comparison_map = {
186            "gt": ">",
187            "gte": ">=",
188            "lt": "<",
189            "lte": "<=",
190            "equals": "=",
191            "contains": "SIMILAR TO",
192        }
193
194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
195
196        self.code_type_map_to_sql = {
197            "Integer": "INTEGER",
198            "String": "VARCHAR",
199            "Float": "FLOAT",
200            "Flag": "VARCHAR",
201        }
202
203        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
205    def get_indexing(self) -> bool:
206        """
207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
208        returns False.
209        :return: The value of the indexing parameter.
210        """
211
212        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
214    def get_connexion_config(self) -> dict:
215        """
216        The function `get_connexion_config` returns a dictionary containing the configuration for a
217        connection, including the number of threads and memory limit.
218        :return: a dictionary containing the configuration for the Connexion library.
219        """
220
221        # config
222        config = self.get_config()
223
224        # Connexion config
225        connexion_config = {}
226        threads = self.get_threads()
227
228        # Threads
229        if threads:
230            connexion_config["threads"] = threads
231
232        # Memory
233        # if config.get("memory", None):
234        #     connexion_config["memory_limit"] = config.get("memory")
235        if self.get_memory():
236            connexion_config["memory_limit"] = self.get_memory()
237
238        # Temporary directory
239        if config.get("tmp", None):
240            connexion_config["temp_directory"] = config.get("tmp")
241
242        # Access
243        if config.get("access", None):
244            access = config.get("access")
245            if access in ["RO"]:
246                access = "READ_ONLY"
247            elif access in ["RW"]:
248                access = "READ_WRITE"
249            connexion_db = self.get_connexion_db()
250            if connexion_db in ":memory:":
251                access = "READ_WRITE"
252            connexion_config["access_mode"] = access
253
254        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
256    def get_duckdb_settings(self) -> dict:
257        """
258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
259        string.
260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
261        """
262
263        # config
264        config = self.get_config()
265
266        # duckdb settings
267        duckdb_settings_dict = {}
268        if config.get("duckdb_settings", None):
269            duckdb_settings = config.get("duckdb_settings")
270            duckdb_settings = full_path(duckdb_settings)
271            # duckdb setting is a file
272            if os.path.exists(duckdb_settings):
273                with open(duckdb_settings) as json_file:
274                    duckdb_settings_dict = yaml.safe_load(json_file)
275            # duckdb settings is a string
276            else:
277                duckdb_settings_dict = json.loads(duckdb_settings)
278
279        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
281    def set_connexion_db(self) -> str:
282        """
283        The function `set_connexion_db` returns the appropriate database connection string based on the
284        input format and connection type.
285        :return: the value of the variable `connexion_db`.
286        """
287
288        # Default connexion db
289        default_connexion_db = ":memory:"
290
291        # Find connexion db
292        if self.get_input_format() in ["db", "duckdb"]:
293            connexion_db = self.get_input()
294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
295            connexion_db = default_connexion_db
296        elif self.get_connexion_type() in ["tmpfile"]:
297            tmp_name = tempfile.mkdtemp(
298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
299            )
300            connexion_db = f"{tmp_name}/tmp.db"
301        elif self.get_connexion_type() != "":
302            connexion_db = self.get_connexion_type()
303        else:
304            connexion_db = default_connexion_db
305
306        # Set connexion db
307        self.connexion_db = connexion_db
308
309        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
311    def set_connexion(self, conn) -> None:
312        """
313        The function `set_connexion` creates a connection to a database, with options for different
314        database formats and settings.
315
316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
317        database. If a connection is not provided, a new connection to an in-memory database is created.
318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
319        sqlite
320        """
321
322        # Connexion db
323        connexion_db = self.set_connexion_db()
324
325        # Connexion config
326        connexion_config = self.get_connexion_config()
327
328        # Connexion format
329        connexion_format = self.get_config().get("connexion_format", "duckdb")
330        # Set connexion format
331        self.connexion_format = connexion_format
332
333        # Connexion
334        if not conn:
335            if connexion_format in ["duckdb"]:
336                conn = duckdb.connect(connexion_db, config=connexion_config)
337                # duckDB settings
338                duckdb_settings = self.get_duckdb_settings()
339                if duckdb_settings:
340                    for setting in duckdb_settings:
341                        setting_value = duckdb_settings.get(setting)
342                        if isinstance(setting_value, str):
343                            setting_value = f"'{setting_value}'"
344                        conn.execute(f"PRAGMA {setting}={setting_value};")
345            elif connexion_format in ["sqlite"]:
346                conn = sqlite3.connect(connexion_db)
347
348        # Set connexion
349        self.conn = conn
350
351        # Log
352        log.debug(f"connexion_format: {connexion_format}")
353        log.debug(f"connexion_db: {connexion_db}")
354        log.debug(f"connexion config: {connexion_config}")
355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
357    def set_output(self, output: str = None) -> None:
358        """
359        The `set_output` function in Python sets the output file based on the input or a specified key
360        in the config file, extracting the output name, extension, and format.
361
362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
363        the output file. If the config file has an 'output' key, the method sets the output to the value
364        of that key. If no output is provided, it sets the output to `None`
365        :type output: str
366        """
367
368        if output and not isinstance(output, str):
369            self.output = output.name
370        else:
371            self.output = output
372
373        # Output format
374        if self.output:
375            output_name, output_extension = os.path.splitext(self.output)
376            self.output_name = output_name
377            self.output_extension = output_extension
378            self.output_format = self.output_extension.replace(".", "")
379        else:
380            self.output_name = None
381            self.output_extension = None
382            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file, either as a path string or as a file-like object whose name attribute is used. If no output is provided, the output and its derived attributes are set to None.
def set_header(self) -> None:
384    def set_header(self) -> None:
385        """
386        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
387        """
388
389        input_file = self.get_input()
390        default_header_list = [
391            "##fileformat=VCFv4.2",
392            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
393        ]
394
395        # Full path
396        input_file = full_path(input_file)
397
398        if input_file:
399
400            input_format = self.get_input_format()
401            input_compressed = self.get_input_compressed()
402            config = self.get_config()
403            header_list = default_header_list
404            if input_format in [
405                "vcf",
406                "hdr",
407                "tsv",
408                "csv",
409                "psv",
410                "parquet",
411                "db",
412                "duckdb",
413            ]:
414                # header provided in param
415                if config.get("header_file", None):
416                    with open(config.get("header_file"), "rt") as f:
417                        header_list = self.read_vcf_header(f)
418                # within a vcf file format (header within input file itsself)
419                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
420                    # within a compressed vcf file format (.vcf.gz)
421                    if input_compressed:
422                        with bgzf.open(input_file, "rt") as f:
423                            header_list = self.read_vcf_header(f)
424                    # within an uncompressed vcf file format (.vcf)
425                    else:
426                        with open(input_file, "rt") as f:
427                            header_list = self.read_vcf_header(f)
428                # header provided in default external file .hdr
429                elif os.path.exists((input_file + ".hdr")):
430                    with open(input_file + ".hdr", "rt") as f:
431                        header_list = self.read_vcf_header(f)
432                else:
433                    try:  # Try to get header info fields and file columns
434
435                        with tempfile.TemporaryDirectory() as tmpdir:
436
437                            # Create database
438                            db_for_header = Database(database=input_file)
439
440                            # Get header columns for infos fields
441                            db_header_from_columns = (
442                                db_for_header.get_header_from_columns()
443                            )
444
445                            # Get real columns in the file
446                            db_header_columns = db_for_header.get_columns()
447
448                            # Write header file
449                            header_file_tmp = os.path.join(tmpdir, "header")
450                            f = open(header_file_tmp, "w")
451                            vcf.Writer(f, db_header_from_columns)
452                            f.close()
453
454                            # Replace #CHROM line with rel columns
455                            header_list = db_for_header.read_header_file(
456                                header_file=header_file_tmp
457                            )
458                            header_list[-1] = "\t".join(db_header_columns)
459
460                    except:
461
462                        log.warning(
463                            f"No header for file {input_file}. Set as default VCF header"
464                        )
465                        header_list = default_header_list
466
467            else:  # try for unknown format ?
468
469                log.error(f"Input file format '{input_format}' not available")
470                raise ValueError(f"Input file format '{input_format}' not available")
471
472            if not header_list:
473                header_list = default_header_list
474
475            # header as list
476            self.header_list = header_list
477
478            # header as VCF object
479            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
480
481        else:
482
483            self.header_list = None
484            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
487        """
488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
489        DataFrame based on the connection format.
490
491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
492        represents the SQL query you want to execute. This query will be used to fetch data from a
493        database and convert it into a pandas DataFrame
494        :type query: str
495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
497        function will only fetch up to that number of rows from the database query result. If no limit
498        is specified,
499        :type limit: int
500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
501        """
502
503        # Connexion format
504        connexion_format = self.get_connexion_format()
505
506        # Limit in query
507        if limit:
508            pd.set_option("display.max_rows", limit)
509            if connexion_format in ["duckdb"]:
510                df = (
511                    self.conn.execute(query)
512                    .fetch_record_batch(limit)
513                    .read_next_batch()
514                    .to_pandas()
515                )
516            elif connexion_format in ["sqlite"]:
517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
518
519        # Full query
520        else:
521            if connexion_format in ["duckdb"]:
522                df = self.conn.execute(query).df()
523            elif connexion_format in ["sqlite"]:
524                df = pd.read_sql_query(query, self.conn)
525
526        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
528    def get_overview(self) -> None:
529        """
530        The function prints the input, output, config, and dataframe of the current object
531        """
532        table_variants_from = self.get_table_variants(clause="from")
533        sql_columns = self.get_header_columns_as_sql()
534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
535        df = self.get_query_to_df(sql_query_export)
536        log.info(
537            "Input:  "
538            + str(self.get_input())
539            + " ["
540            + str(str(self.get_input_format()))
541            + "]"
542        )
543        log.info(
544            "Output: "
545            + str(self.get_output())
546            + " ["
547            + str(str(self.get_output_format()))
548            + "]"
549        )
550        log.info("Config: ")
551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
552            "\n"
553        ):
554            log.info("\t" + str(d))
555        log.info("Param: ")
556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
557            "\n"
558        ):
559            log.info("\t" + str(d))
560        log.info("Sample list: " + str(self.get_header_sample_list()))
561        log.info("Dataframe: ")
562        for d in str(df).split("\n"):
563            log.info("\t" + str(d))
564
565        # garbage collector
566        del df
567        gc.collect()
568
569        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
571    def get_stats(self) -> dict:
572        """
573        The `get_stats` function calculates and returns various statistics of the current object,
574        including information about the input file, variants, samples, header fields, quality, and
575        SNVs/InDels.
576        :return: a dictionary containing various statistics of the current object. The dictionary has
577        the following structure:
578        """
579
580        # Log
581        log.info(f"Stats Calculation...")
582
583        # table varaints
584        table_variants_from = self.get_table_variants()
585
586        # stats dict
587        stats = {"Infos": {}}
588
589        ### File
590        input_file = self.get_input()
591        stats["Infos"]["Input file"] = input_file
592
593        # Header
594        header_infos = self.get_header().infos
595        header_formats = self.get_header().formats
596        header_infos_list = list(header_infos)
597        header_formats_list = list(header_formats)
598
599        ### Variants
600
601        stats["Variants"] = {}
602
603        # Variants by chr
604        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
605        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
606        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
607            by=["CHROM"], kind="quicksort"
608        )
609
610        # Total number of variants
611        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
612
613        # Calculate percentage
614        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
615            lambda x: (x / nb_of_variants)
616        )
617
618        stats["Variants"]["Number of variants by chromosome"] = (
619            nb_of_variants_by_chrom.to_dict(orient="index")
620        )
621
622        stats["Infos"]["Number of variants"] = int(nb_of_variants)
623
624        ### Samples
625
626        # Init
627        samples = {}
628        nb_of_samples = 0
629
630        # Check Samples
631        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
632            log.debug(f"Check samples...")
633            for sample in self.get_header_sample_list():
634                sql_query_samples = f"""
635                    SELECT  '{sample}' as sample,
636                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
637                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
638                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
639                    FROM {table_variants_from}
640                    WHERE (
641                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
642                        AND
643                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
644                      )
645                    GROUP BY genotype
646                    """
647                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
648                sample_genotype_count = sql_query_genotype_df["count"].sum()
649                if len(sql_query_genotype_df):
650                    nb_of_samples += 1
651                    samples[f"{sample} - {sample_genotype_count} variants"] = (
652                        sql_query_genotype_df.to_dict(orient="index")
653                    )
654
655            stats["Samples"] = samples
656            stats["Infos"]["Number of samples"] = nb_of_samples
657
658        # #
659        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
660        #     stats["Infos"]["Number of samples"] = nb_of_samples
661        # elif nb_of_samples:
662        #     stats["Infos"]["Number of samples"] = "not a VCF format"
663
664        ### INFO and FORMAT fields
665        header_types_df = {}
666        header_types_list = {
667            "List of INFO fields": header_infos,
668            "List of FORMAT fields": header_formats,
669        }
670        i = 0
671        for header_type in header_types_list:
672
673            header_type_infos = header_types_list.get(header_type)
674            header_infos_dict = {}
675
676            for info in header_type_infos:
677
678                i += 1
679                header_infos_dict[i] = {}
680
681                # ID
682                header_infos_dict[i]["id"] = info
683
684                # num
685                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
686                if header_type_infos[info].num in genotype_map.keys():
687                    header_infos_dict[i]["Number"] = genotype_map.get(
688                        header_type_infos[info].num
689                    )
690                else:
691                    header_infos_dict[i]["Number"] = header_type_infos[info].num
692
693                # type
694                if header_type_infos[info].type:
695                    header_infos_dict[i]["Type"] = header_type_infos[info].type
696                else:
697                    header_infos_dict[i]["Type"] = "."
698
699                # desc
700                if header_type_infos[info].desc != None:
701                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
702                else:
703                    header_infos_dict[i]["Description"] = ""
704
705            if len(header_infos_dict):
706                header_types_df[header_type] = pd.DataFrame.from_dict(
707                    header_infos_dict, orient="index"
708                ).to_dict(orient="index")
709
710        # Stats
711        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
712        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
713        stats["Header"] = header_types_df
714
715        ### QUAL
716        if "QUAL" in self.get_header_columns():
717            sql_query_qual = f"""
718                    SELECT
719                        avg(CAST(QUAL AS INTEGER)) AS Average,
720                        min(CAST(QUAL AS INTEGER)) AS Minimum,
721                        max(CAST(QUAL AS INTEGER)) AS Maximum,
722                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
723                        median(CAST(QUAL AS INTEGER)) AS Median,
724                        variance(CAST(QUAL AS INTEGER)) AS Variance
725                    FROM {table_variants_from}
726                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
727                    """
728
729            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
730            stats["Quality"] = {"Stats": qual}
731
732        ### SNV and InDel
733
734        sql_query_snv = f"""
735            
736            SELECT Type, count FROM (
737
738                    SELECT
739                        'Total' AS Type,
740                        count(*) AS count
741                    FROM {table_variants_from}
742
743                    UNION
744
745                    SELECT
746                        'MNV' AS Type,
747                        count(*) AS count
748                    FROM {table_variants_from}
749                    WHERE len(REF) > 1 AND len(ALT) > 1
750                    AND len(REF) = len(ALT)
751
752                    UNION
753
754                    SELECT
755                        'InDel' AS Type,
756                        count(*) AS count
757                    FROM {table_variants_from}
758                    WHERE len(REF) > 1 OR len(ALT) > 1
759                    AND len(REF) != len(ALT)
760                    
761                    UNION
762
763                    SELECT
764                        'SNV' AS Type,
765                        count(*) AS count
766                    FROM {table_variants_from}
767                    WHERE len(REF) = 1 AND len(ALT) = 1
768
769                )
770
771            ORDER BY count DESC
772
773                """
774        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
775
776        sql_query_snv_substitution = f"""
777                SELECT
778                    concat(REF, '>', ALT) AS 'Substitution',
779                    count(*) AS count
780                FROM {table_variants_from}
781                WHERE len(REF) = 1 AND len(ALT) = 1
782                GROUP BY REF, ALT
783                ORDER BY count(*) DESC
784                """
785        snv_substitution = (
786            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
787        )
788        stats["Variants"]["Counts"] = snv_indel
789        stats["Variants"]["Substitutions"] = snv_substitution
790
791        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
793    def stats_to_file(self, file: str = None) -> str:
794        """
795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
796        into a JSON object, and writes the JSON object to the specified file.
797
798        :param file: The `file` parameter is a string that represents the file path where the JSON data
799        will be written
800        :type file: str
801        :return: the name of the file that was written to.
802        """
803
804        # Get stats
805        stats = self.get_stats()
806
807        # Serializing json
808        json_object = json.dumps(stats, indent=4)
809
810        # Writing to sample.json
811        with open(file, "w") as outfile:
812            outfile.write(json_object)
813
814        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
816    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
817        """
818        The `print_stats` function generates a markdown file and prints the statistics contained in a
819        JSON file in a formatted manner.
820
821        :param output_file: The `output_file` parameter is a string that specifies the path and filename
822        of the output file where the stats will be printed in Markdown format. If no `output_file` is
823        provided, a temporary directory will be created and the stats will be saved in a file named
824        "stats.md" within that
825        :type output_file: str
826        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
827        file where the statistics will be saved. If no value is provided, a temporary directory will be
828        created and a default file name "stats.json" will be used
829        :type json_file: str
830        :return: The function `print_stats` does not return any value. It has a return type annotation
831        of `None`.
832        """
833
834        # Full path
835        output_file = full_path(output_file)
836        json_file = full_path(json_file)
837
838        with tempfile.TemporaryDirectory() as tmpdir:
839
840            # Files
841            if not output_file:
842                output_file = os.path.join(tmpdir, "stats.md")
843            if not json_file:
844                json_file = os.path.join(tmpdir, "stats.json")
845
846            # Create folders
847            if not os.path.exists(os.path.dirname(output_file)):
848                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
849            if not os.path.exists(os.path.dirname(json_file)):
850                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
851
852            # Create stats JSON file
853            stats_file = self.stats_to_file(file=json_file)
854
855            # Print stats file
856            with open(stats_file) as f:
857                stats = yaml.safe_load(f)
858
859            # Output
860            output_title = []
861            output_index = []
862            output = []
863
864            # Title
865            output_title.append("# HOWARD Stats")
866
867            # Index
868            output_index.append("## Index")
869
870            # Process sections
871            for section in stats:
872                infos = stats.get(section)
873                section_link = "#" + section.lower().replace(" ", "-")
874                output.append(f"## {section}")
875                output_index.append(f"- [{section}]({section_link})")
876
877                if len(infos):
878                    for info in infos:
879                        try:
880                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
881                            is_df = True
882                        except:
883                            try:
884                                df = pd.DataFrame.from_dict(
885                                    json.loads((infos.get(info))), orient="index"
886                                )
887                                is_df = True
888                            except:
889                                is_df = False
890                        if is_df:
891                            output.append(f"### {info}")
892                            info_link = "#" + info.lower().replace(" ", "-")
893                            output_index.append(f"   - [{info}]({info_link})")
894                            output.append(f"{df.to_markdown(index=False)}")
895                        else:
896                            output.append(f"- {info}: {infos.get(info)}")
897                else:
898                    output.append(f"NA")
899
900            # Write stats in markdown file
901            with open(output_file, "w") as fp:
902                for item in output_title:
903                    fp.write("%s\n" % item)
904                for item in output_index:
905                    fp.write("%s\n" % item)
906                for item in output:
907                    fp.write("%s\n" % item)
908
909            # Output stats in markdown
910            print("")
911            print("\n\n".join(output_title))
912            print("")
913            print("\n\n".join(output))
914            print("")
915
916        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that temporary directory.
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
918    def get_input(self) -> str:
919        """
920        It returns the value of the input variable.
921        :return: The input is being returned.
922        """
923        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
925    def get_input_format(self, input_file: str = None) -> str:
926        """
927        This function returns the format of the input variable, either from the provided input file or
928        by prompting for input.
929
930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
931        represents the file path of the input file. If no `input_file` is provided when calling the
932        method, it will default to `None`
933        :type input_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not input_file:
938            input_file = self.get_input()
939        input_format = get_file_format(input_file)
940        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
942    def get_input_compressed(self, input_file: str = None) -> str:
943        """
944        The function `get_input_compressed` returns the format of the input variable after compressing
945        it.
946
947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
948        that represents the file path of the input file. If no `input_file` is provided when calling the
949        method, it will default to `None` and the method will then call `self.get_input()` to
950        :type input_file: str
951        :return: The function `get_input_compressed` returns the compressed format of the input
952        variable.
953        """
954
955        if not input_file:
956            input_file = self.get_input()
957        input_compressed = get_file_compressed(input_file)
958        return input_compressed

The function get_input_compressed returns whether the input file is compressed, based on its extension.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to retrieve the object's input file.

Returns

The function get_input_compressed returns the compression status of the input file.

def get_output(self) -> str:
960    def get_output(self) -> str:
961        """
962        It returns the output of the neuron.
963        :return: The output of the neural network.
964        """
965
966        return self.output

It returns the output file of the object.

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
968    def get_output_format(self, output_file: str = None) -> str:
969        """
970        The function `get_output_format` returns the format of the input variable or the output file if
971        provided.
972
973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
974        that represents the file path of the output file. If no `output_file` is provided when calling
975        the method, it will default to the output obtained from the `get_output` method of the class
976        instance. The
977        :type output_file: str
978        :return: The format of the input variable is being returned.
979        """
980
981        if not output_file:
982            output_file = self.get_output()
983        output_format = get_file_format(output_file)
984
985        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the output file is being returned.

def get_config(self) -> dict:
987    def get_config(self) -> dict:
988        """
989        It returns the config
990        :return: The config variable is being returned.
991        """
992        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
994    def get_param(self) -> dict:
995        """
996        It returns the param
997        :return: The param variable is being returned.
998        """
999        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
1001    def get_connexion_db(self) -> str:
1002        """
1003        It returns the connexion_db attribute of the object
1004        :return: The connexion_db is being returned.
1005        """
1006        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1008    def get_prefix(self) -> str:
1009        """
1010        It returns the prefix of the object.
1011        :return: The prefix is being returned.
1012        """
1013        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1015    def get_table_variants(self, clause: str = "select") -> str:
1016        """
1017        This function returns the table_variants attribute of the object
1018
1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1020        defaults to select (optional)
1021        :return: The table_variants attribute of the object.
1022        """
1023
1024        # Access
1025        access = self.get_config().get("access", None)
1026
1027        # Clauses "select", "where", "update"
1028        if clause in ["select", "where", "update"]:
1029            table_variants = self.table_variants
1030        # Clause "from"
1031        elif clause in ["from"]:
1032            # For Read Only
1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1034                input_file = self.get_input()
1035                table_variants = f"'{input_file}' as variants"
1036            # For Read Write
1037            else:
1038                table_variants = f"{self.table_variants} as variants"
1039        else:
1040            table_variants = self.table_variants
1041        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1043    def get_tmp_dir(self) -> str:
1044        """
1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
1046        parameters or a default path.
1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1048        configuration, parameters, and a default value of "/tmp".
1049        """
1050
1051        return get_tmp(
1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1053        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the configuration.

        :return: The "connexion_type" config value, defaulting to "memory".
        """
        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory".

Returns

The connexion type is being returned.

def get_connexion(self):
1063    def get_connexion(self):
1064        """
1065        It returns the connection object
1066
1067        :return: The connection object.
1068        """
1069        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1071    def close_connexion(self) -> None:
1072        """
1073        This function closes the connection to the database.
1074        :return: The connection is being closed.
1075        """
1076        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1078    def get_header(self, type: str = "vcf"):
1079        """
1080        This function returns the header of the VCF file as a list of strings
1081
1082        :param type: the type of header you want to get, defaults to vcf (optional)
1083        :return: The header of the vcf file.
1084        """
1085
1086        if self.header_vcf:
1087            if type == "vcf":
1088                return self.header_vcf
1089            elif type == "list":
1090                return self.header_list
1091        else:
1092            if type == "vcf":
1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1094                return header
1095            elif type == "list":
1096                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
1098    def get_header_infos_list(self) -> list:
1099        """
1100        This function retrieves a list of information fields from the header.
1101        :return: A list of information fields from the header.
1102        """
1103
1104        # Init
1105        infos_list = []
1106
1107        for field in self.get_header().infos:
1108            infos_list.append(field)
1109
1110        return infos_list

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
1112    def get_header_length(self, file: str = None) -> int:
1113        """
1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1115        line.
1116
1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1118        header file. If this argument is provided, the function will read the header from the specified
1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1120        :type file: str
1121        :return: the length of the header list, excluding the #CHROM line.
1122        """
1123
1124        if file:
1125            return len(self.read_vcf_header_file(file=file)) - 1
1126        elif self.get_header(type="list"):
1127            return len(self.get_header(type="list")) - 1
1128        else:
1129            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1131    def get_header_columns(self) -> str:
1132        """
1133        This function returns the header list of a VCF
1134
1135        :return: The length of the header list.
1136        """
1137        if self.get_header():
1138            return self.get_header(type="list")[-1]
1139        else:
1140            return ""

This function returns the last line (#CHROM columns line) of the VCF header

Returns

The #CHROM columns line of the header, or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
1142    def get_header_columns_as_list(self) -> list:
1143        """
1144        This function returns the header list of a VCF
1145
1146        :return: The length of the header list.
1147        """
1148        if self.get_header():
1149            return self.get_header_columns().strip().split("\t")
1150        else:
1151            return []

This function returns the #CHROM columns of the VCF header as a list

Returns

The header column names as a list, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
1153    def get_header_columns_as_sql(self) -> str:
1154        """
1155        This function retruns header length (without #CHROM line)
1156
1157        :return: The length of the header list.
1158        """
1159        sql_column_list = []
1160        for col in self.get_header_columns_as_list():
1161            sql_column_list.append(f'"{col}"')
1162        return ",".join(sql_column_list)

This function returns the header column names as a comma-separated list of double-quoted SQL identifiers

Returns

A string of SQL-quoted column names joined by commas.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1164    def get_header_sample_list(
1165        self, check: bool = False, samples: list = None, samples_force: bool = False
1166    ) -> list:
1167        """
1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1169        checking and filtering based on input parameters.
1170
1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1172        parameter that determines whether to check if the samples in the list are properly defined as
1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1174        list is defined as a, defaults to False
1175        :type check: bool (optional)
1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1177        allows you to specify a subset of samples from the header. If you provide a list of sample
1178        names, the function will check if each sample is defined in the header. If a sample is not found
1179        in the
1180        :type samples: list
1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1182        a boolean parameter that determines whether to force the function to return the sample list
1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1184        function will return the sample list without performing, defaults to False
1185        :type samples_force: bool (optional)
1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
1187        parameters and conditions specified in the function.
1188        """
1189
1190        # Init
1191        samples_list = []
1192
1193        if samples is None:
1194            samples_list = self.header_vcf.samples
1195        else:
1196            samples_checked = []
1197            for sample in samples:
1198                if sample in self.header_vcf.samples:
1199                    samples_checked.append(sample)
1200                else:
1201                    log.warning(f"Sample '{sample}' not defined in header")
1202            samples_list = samples_checked
1203
1204            # Force sample list without checking if is_genotype_column
1205            if samples_force:
1206                log.warning(f"Samples {samples_list} not checked if genotypes")
1207                return samples_list
1208
1209        if check:
1210            samples_checked = []
1211            for sample in samples_list:
1212                if self.is_genotype_column(column=sample):
1213                    samples_checked.append(sample)
1214                else:
1215                    log.warning(
1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1217                    )
1218            samples_list = samples_checked
1219
1220        # Return samples list
1221        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1223    def is_genotype_column(self, column: str = None) -> bool:
1224        """
1225        This function checks if a given column is a genotype column in a database.
1226
1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1228        represents the column name in a database table. This method checks if the specified column is a
1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1230        method of
1231        :type column: str
1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1234        column name and returns the result. If the `column` parameter is None, it returns False.
1235        """
1236
1237        if column is not None:
1238            return Database(database=self.get_input()).is_genotype_column(column=column)
1239        else:
1240            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1242    def get_verbose(self) -> bool:
1243        """
1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1245        exist
1246
1247        :return: The value of the key "verbose" in the config dictionary.
1248        """
1249        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1251    def get_connexion_format(self) -> str:
1252        """
1253        It returns the connexion format of the object.
1254        :return: The connexion_format is being returned.
1255        """
1256        connexion_format = self.connexion_format
1257        if connexion_format not in ["duckdb", "sqlite"]:
1258            log.error(f"Unknown connexion format {connexion_format}")
1259            raise ValueError(f"Unknown connexion format {connexion_format}")
1260        else:
1261            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
1263    def insert_file_to_table(
1264        self,
1265        file,
1266        columns: str,
1267        header_len: int = 0,
1268        sep: str = "\t",
1269        chunksize: int = 1000000,
1270    ) -> None:
1271        """
1272        The function reads a file in chunks and inserts each chunk into a table based on the specified
1273        database format.
1274
1275        :param file: The `file` parameter is the file that you want to load into a table. It should be
1276        the path to the file on your system
1277        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1278        should contain the names of the columns in the table where the data will be inserted. The column
1279        names should be separated by commas within the string. For example, if you have columns named
1280        "id", "name
1281        :type columns: str
1282        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1283        the number of lines to skip at the beginning of the file before reading the actual data. This
1284        parameter allows you to skip any header information present in the file before processing the
1285        data, defaults to 0
1286        :type header_len: int (optional)
1287        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1288        separator character that is used in the file being read. In this case, the default separator is
1289        set to `\t`, which represents a tab character. You can change this parameter to a different
1290        separator character if, defaults to \t
1291        :type sep: str (optional)
1292        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1293        when processing the file in chunks. In the provided code snippet, the default value for
1294        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1295        to 1000000
1296        :type chunksize: int (optional)
1297        """
1298
1299        # Config
1300        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1301        connexion_format = self.get_connexion_format()
1302
1303        log.debug("chunksize: " + str(chunksize))
1304
1305        if chunksize:
1306            for chunk in pd.read_csv(
1307                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1308            ):
1309                if connexion_format in ["duckdb"]:
1310                    sql_insert_into = (
1311                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1312                    )
1313                    self.conn.execute(sql_insert_into)
1314                elif connexion_format in ["sqlite"]:
1315                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to `\t`, which represents a tab character. You can change this parameter to a different separator character if needed, defaults to `\t`
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file (VCF/TSV/CSV/PSV or an existing database format)
        and load it into the variants table of the current connexion.

        For a DuckDB connexion, the input is exposed through a ``Database``
        object and loaded with a single CREATE TABLE/VIEW statement (a VIEW
        in read-only mode). For an SQLite connexion, a variants table is
        created explicitly and the file is inserted chunk by chunk. After
        loading, INFO fields may be exploded into columns and indexes are
        created.

        :param input_file: The path to the input file to load; when given it
        replaces the instance input and the header is re-read
        :type input_file: str
        :param drop_variants_table: When True, drop the variants table
        before loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled to infer the schema; a
        falsy value means no limit (-1), defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: If the input format is not compatible with or
        not available for the current connexion format.
        """

        log.info("Loading...")

        # Change input file: replace instance input and re-read its header
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop variants table if requested
        if drop_variants_table:
            self.drop_variants_table()

        # Get table variants
        table_variants = self.get_table_variants()

        # Access mode ("RO" triggers view creation instead of a table)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression status
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means unlimited (-1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Input is already a database: nothing to load
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): this inner check is always True here (outer
                # branch already requires duckdb); the else is dead code
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View from the input's SQL source
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        # Read Only: a view avoids copying the data
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available (NOTE(review): bare except hides
                    # the original failure cause)
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples: add FORMAT and one column per sample.
            # NOTE(review): structure_complete is the same dict object as
            # structure (alias, not a copy)
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table if needed
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the number of file rows loaded per chunk
            chunksize = 100000

            # Delimiter derived from the input format (default: tab)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format.
                # NOTE(review): the bgzf handle opened here replaces the
                # plain handle managed by the with block and is never
                # explicitly closed — potential resource leak
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the variants table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1513    def get_explode_infos(self) -> bool:
1514        """
1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1516        to False if it is not set.
1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1518        value. If the parameter is not present, it will return False.
1519        """
1520
1521        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1523    def get_explode_infos_fields(
1524        self,
1525        explode_infos_fields: str = None,
1526        remove_fields_not_in_header: bool = False,
1527    ) -> list:
1528        """
1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1530        the input parameter `explode_infos_fields`.
1531
1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1534        comma-separated list of field names to explode
1535        :type explode_infos_fields: str
1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1537        flag that determines whether to remove fields that are not present in the header. If it is set
1538        to `True`, any field that is not in the header will be excluded from the list of exploded
1539        information fields. If it is set to `, defaults to False
1540        :type remove_fields_not_in_header: bool (optional)
1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
1545        splitting the string by commas.
1546        """
1547
1548        # If no fields, get it in param
1549        if not explode_infos_fields:
1550            explode_infos_fields = (
1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1552            )
1553
1554        # If no fields, defined as all fields in header using keyword
1555        if not explode_infos_fields:
1556            explode_infos_fields = "*"
1557
1558        # If fields list not empty
1559        if explode_infos_fields:
1560
1561            # Input fields list
1562            if isinstance(explode_infos_fields, str):
1563                fields_input = explode_infos_fields.split(",")
1564            elif isinstance(explode_infos_fields, list):
1565                fields_input = explode_infos_fields
1566            else:
1567                fields_input = []
1568
1569            # Fields list without * keyword
1570            fields_without_all = fields_input.copy()
1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
1572                fields_without_all.remove("*")
1573
1574            # Fields in header
1575            fields_in_header = sorted(list(set(self.get_header().infos)))
1576
1577            # Construct list of fields
1578            fields_output = []
1579            for field in fields_input:
1580
1581                # Strip field
1582                field = field.strip()
1583
1584                # format keyword * in regex
1585                if field.upper() in ["*"]:
1586                    field = ".*"
1587
1588                # Find all fields with pattern
1589                r = re.compile(field)
1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
1591
1592                # Remove fields input from search
1593                if field in fields_search:
1594                    fields_search = [field]
1595                elif fields_search != [field]:
1596                    fields_search = sorted(
1597                        list(set(fields_search).difference(fields_input))
1598                    )
1599
1600                # If field is not in header (avoid not well formatted header)
1601                if not fields_search and not remove_fields_not_in_header:
1602                    fields_search = [field]
1603
1604                # Add found fields
1605                for new_field in fields_search:
1606                    # Add field, if not already exists, and if it is in header (if asked)
1607                    if (
1608                        new_field not in fields_output
1609                        and (
1610                            not remove_fields_not_in_header
1611                            or new_field in fields_in_header
1612                        )
1613                        and new_field not in [".*"]
1614                    ):
1615                        fields_output.append(new_field)
1616
1617            return fields_output
1618
1619        else:
1620
1621            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a comma-separated string (or list) of field names or regex patterns specifying the fields to explode; the keyword "*" explodes all fields of the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header is excluded from the list of exploded information fields, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. When explode_infos_fields is not provided, the value is read from the parameters; when no value is configured at all, the "*" keyword is assumed and all header fields are returned. Each entry of the comma-separated string is stripped of surrounding spaces before being resolved.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1624        """
1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1627        not provided.
1628
1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1630        prefix to be used for exploding or expanding information
1631        :type explode_infos_prefix: str
1632        :return: the value of the variable `explode_infos_prefix`.
1633        """
1634
1635        if not explode_infos_prefix:
1636            explode_infos_prefix = (
1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1638            )
1639
1640        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value configured under the "explode" section of the parameters (empty string when not set) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1642    def add_column(
1643        self,
1644        table_name,
1645        column_name,
1646        column_type,
1647        default_value=None,
1648        drop: bool = False,
1649    ) -> dict:
1650        """
1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1652        doesn't already exist.
1653
1654        :param table_name: The name of the table to which you want to add a column
1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
1656        to the table
1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
1658        want to add to the table. It should be a string that represents the desired data type, such as
1659        "INTEGER", "TEXT", "REAL", etc
1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1661        default value for the newly added column. If a default value is provided, it will be assigned to
1662        the column for any existing rows that do not have a value for that column
1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1666        to False
1667        :type drop: bool (optional)
1668        :return: a boolean value indicating whether the column was successfully added to the table.
1669        """
1670
1671        # added
1672        added = False
1673        dropped = False
1674
1675        # Check if the column already exists in the table
1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1677        columns = self.get_query_to_df(query).columns.tolist()
1678        if column_name.upper() in [c.upper() for c in columns]:
1679            log.debug(
1680                f"The {column_name} column already exists in the {table_name} table"
1681            )
1682            if drop:
1683                self.drop_column(table_name=table_name, column_name=column_name)
1684                dropped = True
1685            else:
1686                return None
1687        else:
1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1689
1690        # Add column in table
1691        add_column_query = (
1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1693        )
1694        if default_value is not None:
1695            add_column_query += f" DEFAULT {default_value}"
1696        self.execute_query(add_column_query)
1697        added = not dropped
1698        log.debug(
1699            f"The {column_name} column was successfully added to the {table_name} table"
1700        )
1701
1702        if added:
1703            added_column = {
1704                "table_name": table_name,
1705                "column_name": column_name,
1706                "column_type": column_type,
1707                "default_value": default_value,
1708            }
1709        else:
1710            added_column = None
1711
1712        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column; if False, an existing column is left untouched, defaults to False
Returns

a dict describing the added column (table name, column name, column type, default value), or None if the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1714    def drop_column(
1715        self, column: dict = None, table_name: str = None, column_name: str = None
1716    ) -> bool:
1717        """
1718        The `drop_column` function drops a specified column from a given table in a database and returns
1719        True if the column was successfully dropped, and False if the column does not exist in the
1720        table.
1721
1722        :param column: The `column` parameter is a dictionary that contains information about the column
1723        you want to drop. It has two keys:
1724        :type column: dict
1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
1726        drop a column
1727        :type table_name: str
1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1729        from the table
1730        :type column_name: str
1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1732        and False if the column does not exist in the table.
1733        """
1734
1735        # Find column infos
1736        if column:
1737            if isinstance(column, dict):
1738                table_name = column.get("table_name", None)
1739                column_name = column.get("column_name", None)
1740            elif isinstance(column, str):
1741                table_name = self.get_table_variants()
1742                column_name = column
1743            else:
1744                table_name = None
1745                column_name = None
1746
1747        if not table_name and not column_name:
1748            return False
1749
1750        # Removed
1751        removed = False
1752
1753        # Check if the column already exists in the table
1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1755        columns = self.get_query_to_df(query).columns.tolist()
1756        if column_name in columns:
1757            log.debug(f"The {column_name} column exists in the {table_name} table")
1758        else:
1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1760            return False
1761
1762        # Add column in table # ALTER TABLE integers DROP k
1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1764        self.execute_query(add_column_query)
1765        removed = True
1766        log.debug(
1767            f"The {column_name} column was successfully dropped to the {table_name} table"
1768        )
1769
1770        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: table_name and column_name
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each requested INFO field, a column named "<prefix><field>" is
        added to the table and populated by extracting the field's value from
        the raw INFO string, chromosome by chromosome. Nothing is modified
        when the database is opened read-only ("RO" access).

        :param prefix: Prefix for the exploded INFO columns; falls back to
        get_explode_infos_prefix(), then to "INFO/"
        :type prefix: str
        :param create_index: If True, re-create indexes after the columns have
        been populated, defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields (or patterns) to explode; resolved
        through get_explode_infos_fields(), so an empty value means all
        header fields
        :type fields: list
        :param force: If True, columns that already exist are dropped,
        re-created and re-populated, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: If True, populate all exploded
        columns in a single UPDATE per chromosome instead of one UPDATE per
        field (parameter name spelling kept for backward compatibility),
        defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Name of the table to add the columns to; defaults to
        the variants table when None
        :type table: str
        :return: The list of added columns (dicts as returned by add_column).
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best effort: not all sources provide them)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly given ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only process fields known from header, input list or extras
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Column type from header; unknown fields become String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    # Multi-valued fields are always stored as VARCHAR
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # add_column returns None for a re-created column, so
                    # force alone must also trigger the re-population below
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the per-field UPDATE fragment extracting the
                        # value from the raw INFO string.
                        # NOTE(review): update_info_field is only assigned for
                        # duckdb/sqlite; any other connexion format would
                        # append a stale value here — confirm formats are
                        # limited to these two
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep each UPDATE smaller
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is a single chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE setting every exploded column at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded column
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if False, existing columns are kept, defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together in a single UPDATE statement or individually, one UPDATE per field, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If not provided, the variants table is used
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1989    def create_indexes(self) -> None:
1990        """
1991        Create indexes on the table after insertion
1992        """
1993
1994        # Access
1995        access = self.get_config().get("access", None)
1996
1997        # get table variants
1998        table_variants = self.get_table_variants("FROM")
1999
2000        if self.get_indexing() and access not in ["RO"]:
2001            # Create index
2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2003            self.conn.execute(sql_create_table_index)
2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2005            self.conn.execute(sql_create_table_index)
2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2007            self.conn.execute(sql_create_table_index)
2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2009            self.conn.execute(sql_create_table_index)
2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2011            self.conn.execute(sql_create_table_index)
2012            for field in self.index_additionnal_fields:
2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2014                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2016    def drop_indexes(self) -> None:
2017        """
2018        Create indexes on the table after insertion
2019        """
2020
2021        # Access
2022        access = self.get_config().get("access", None)
2023
2024        # get table variants
2025        table_variants = self.get_table_variants("FROM")
2026
2027        # Get database format
2028        connexion_format = self.get_connexion_format()
2029
2030        if access not in ["RO"]:
2031            if connexion_format in ["duckdb"]:
2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2033            elif connexion_format in ["sqlite"]:
2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2035
2036            list_indexes = self.conn.execute(sql_list_indexes)
2037            index_names = [row[0] for row in list_indexes.fetchall()]
2038            for index in index_names:
2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2040                self.conn.execute(sql_drop_table_index)

Drop all indexes of the variants table

def read_vcf_header(self, f) -> list:
2042    def read_vcf_header(self, f) -> list:
2043        """
2044        It reads the header of a VCF file and returns a list of the header lines
2045
2046        :param f: the file object
2047        :return: The header lines of the VCF file.
2048        """
2049
2050        header_list = []
2051        for line in f:
2052            header_list.append(line)
2053            if line.startswith("#CHROM"):
2054                break
2055        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2057    def read_vcf_header_file(self, file: str = None) -> list:
2058        """
2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2060        uncompressed files.
2061
2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2064        default to `None`
2065        :type file: str
2066        :return: The function `read_vcf_header_file` returns a list.
2067        """
2068
2069        if self.get_input_compressed(input_file=file):
2070            with bgzf.open(file, "rt") as f:
2071                return self.read_vcf_header(f=f)
2072        else:
2073            with open(file, "rt") as f:
2074                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2076    def execute_query(self, query: str):
2077        """
2078        It takes a query as an argument, executes it, and returns the results
2079
2080        :param query: The query to be executed
2081        :return: The result of the query is being returned.
2082        """
2083        if query:
2084            return self.conn.execute(query)  # .fetchall()
2085        else:
2086            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None, fields_to_rename: dict | None = None) -> bool:
2088    def export_output(
2089        self,
2090        output_file: str | None = None,
2091        output_header: str | None = None,
2092        export_header: bool = True,
2093        query: str | None = None,
2094        parquet_partitions: list | None = None,
2095        chunk_size: int | None = None,
2096        threads: int | None = None,
2097        sort: bool = False,
2098        index: bool = False,
2099        order_by: str | None = None,
2100        fields_to_rename: dict | None = None
2101    ) -> bool:
2102        """
2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
2105        partitioning.
2106        
2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
2108        output file where the exported data will be saved
2109        :type output_file: str | None
2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
2112        header will be exported to a file with the same name as the `output_file` parameter, but with
2113        the extension "
2114        :type output_header: str | None
2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2117        True, the header will be exported to a file. If `export_header` is False, the header will not
2118        be, defaults to True
2119        :type export_header: bool (optional)
2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
2121        that can be used to filter and select specific data from the VCF file before exporting it. If
2122        provided, only the data that matches the query will be exported. This allows you to customize
2123        the exported data based on
2124        :type query: str | None
2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2127        organize data in a hierarchical directory structure based on the values of one or more columns.
2128        This can improve query performance when working with large datasets
2129        :type parquet_partitions: list | None
2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
2132        multiple files. It helps in optimizing the export process by breaking down the data into
2133        manageable chunks for processing and storage
2134        :type chunk_size: int | None
2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
2136        threads to be used during the export process. It determines the level of parallelism and can
2137        improve the performance of the export operation. If this parameter is not provided, the function
2138        will use the default number of threads
2139        :type threads: int | None
2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
2141        determines whether the output file should be sorted based on genomic coordinates of the
2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
2143        `False`,, defaults to False
2144        :type sort: bool (optional)
2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
2146        determines whether an index should be created on the output file. If `index` is set to `True`,
2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
2148        :type index: bool (optional)
2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
2152        output file should be
2153        :type order_by: str | None
2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
2155        mapping of field names to be renamed during the export process. This parameter allows you to
2156        customize the output field names before exporting the data. Each key-value pair in the
2157        dictionary represents the original field name as the key and the new field name
2158        :type fields_to_rename: dict | None
2159        :return: The `export_output` function returns a boolean value. It checks if the output file
2160        exists and returns True if it does, or None if it doesn't.
2161        """
2162
2163        # Log
2164        log.info("Exporting...")
2165
2166        # Full path
2167        output_file = full_path(output_file)
2168        output_header = full_path(output_header)
2169
2170        # Config
2171        config = self.get_config()
2172
2173        # Param
2174        param = self.get_param()
2175
2176        # Tmp files to remove
2177        tmp_to_remove = []
2178
2179        # If no output, get it
2180        if not output_file:
2181            output_file = self.get_output()
2182
2183        # If not threads
2184        if not threads:
2185            threads = self.get_threads()
2186
2187        # Rename fields
2188        if not fields_to_rename:
2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
2191
2192        # Auto header name with extension
2193        if export_header or output_header:
2194            if not output_header:
2195                output_header = f"{output_file}.hdr"
2196            # Export header
2197            self.export_header(output_file=output_file)
2198
2199        # Switch off export header if VCF output
2200        output_file_type = get_file_format(output_file)
2201        if output_file_type in ["vcf"]:
2202            export_header = False
2203            tmp_to_remove.append(output_header)
2204
2205        # Chunk size
2206        if not chunk_size:
2207            chunk_size = config.get("chunk_size", None)
2208
2209        # Parquet partition
2210        if not parquet_partitions:
2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2212        if parquet_partitions and isinstance(parquet_partitions, str):
2213            parquet_partitions = parquet_partitions.split(",")
2214
2215        # Order by
2216        if not order_by:
2217            order_by = param.get("export", {}).get("order_by", "")
2218
2219        # Header in output
2220        header_in_output = param.get("export", {}).get("include_header", False)
2221
2222        # Database
2223        database_source = self.get_connexion()
2224
2225        # Connexion format
2226        connexion_format = self.get_connexion_format()
2227
2228        # Explode infos
2229        if self.get_explode_infos():
2230            self.explode_infos(
2231                prefix=self.get_explode_infos_prefix(),
2232                fields=self.get_explode_infos_fields(),
2233                force=False,
2234            )
2235
2236        # if connexion_format in ["sqlite"] or query:
2237        if connexion_format in ["sqlite"]:
2238
2239            # Export in Parquet
2240            random_tmp = "".join(
2241                random.choice(string.ascii_lowercase) for i in range(10)
2242            )
2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2244            tmp_to_remove.append(database_source)
2245
2246            # Table Variants
2247            table_variants = self.get_table_variants()
2248
2249            # Create export query
2250            sql_query_export_subquery = f"""
2251                SELECT * FROM {table_variants}
2252                """
2253
2254            # Write source file
2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2256
2257        # Create database
2258        database = Database(
2259            database=database_source,
2260            table="variants",
2261            header_file=output_header,
2262            conn_config=self.get_connexion_config(),
2263        )
2264
2265        # Existing colomns header
2266        existing_columns_header = database.get_header_columns_from_database(query=query)
2267
2268        # Sample list
2269        if output_file_type in ["vcf"]:
2270            get_samples = self.get_samples()
2271            get_samples_check = self.get_samples_check()
2272            samples_force = get_samples is not None
2273            sample_list = self.get_header_sample_list(
2274                check=get_samples_check,
2275                samples=get_samples,
2276                samples_force=samples_force,
2277            )
2278        else:
2279            sample_list = None
2280
2281        # Export file
2282        database.export(
2283            output_database=output_file,
2284            output_header=output_header,
2285            existing_columns_header=existing_columns_header,
2286            parquet_partitions=parquet_partitions,
2287            chunk_size=chunk_size,
2288            threads=threads,
2289            sort=sort,
2290            index=index,
2291            header_in_output=header_in_output,
2292            order_by=order_by,
2293            query=query,
2294            export_header=export_header,
2295            sample_list=sample_list,
2296        )
2297
2298        # Remove
2299        remove_if_exists(tmp_to_remove)
2300
2301        return (os.path.exists(output_file) or None) and (
2302            os.path.exists(output_file) or None
2303        )

The export_output function exports data from a VCF file to various formats, including VCF, CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and partitioning.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True
  • query: The query parameter in the export_output function is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported. This allows you to customize the exported data based on
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. It helps in optimizing the export process by breaking down the data into manageable chunks for processing and storage
  • threads: The threads parameter in the export_output function specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If this parameter is not provided, the function will use the default number of threads
  • sort: The sort parameter in the export_output function is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. If sort is set to True, the output file will be sorted. If sort is set to False,, defaults to False
  • index: The index parameter in the export_output function is a boolean flag that determines whether an index should be created on the output file. If index is set to True, an index will be created on the output file. If index is set to False, no, defaults to False
  • order_by: The order_by parameter in the export_output function is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format. It allows you to specify the column(s) based on which the output file should be
  • fields_to_rename: The fields_to_rename parameter is a dictionary that specifies the mapping of field names to be renamed during the export process. This parameter allows you to customize the output field names before exporting the data. Each key-value pair in the dictionary represents the original field name as the key and the new field name
Returns

The export_output function returns a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2305    def get_extra_infos(self, table: str = None) -> list:
2306        """
2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2308        in the header.
2309
2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2311        name of the table from which you want to retrieve the extra columns that are not present in the
2312        header. If the `table` parameter is not provided when calling the function, it will default to
2313        using the variants
2314        :type table: str
2315        :return: A list of columns that are in the specified table but not in the header of the table.
2316        """
2317
2318        header_columns = []
2319
2320        if not table:
2321            table = self.get_table_variants(clause="from")
2322            header_columns = self.get_header_columns()
2323
2324        # Check all columns in the database
2325        query = f""" SELECT * FROM {table} LIMIT 1 """
2326        log.debug(f"query {query}")
2327        table_columns = self.get_query_to_df(query).columns.tolist()
2328        extra_columns = []
2329
2330        # Construct extra infos (not in header)
2331        for column in table_columns:
2332            if column not in header_columns:
2333                extra_columns.append(column)
2334
2335        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2337    def get_extra_infos_sql(self, table: str = None) -> str:
2338        """
2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2340        by double quotes
2341
2342        :param table: The name of the table to get the extra infos from. If None, the default table is
2343        used
2344        :type table: str
2345        :return: A string of the extra infos
2346        """
2347
2348        return ", ".join(
2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2350        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2352    def export_header(
2353        self,
2354        header_name: str = None,
2355        output_file: str = None,
2356        output_file_ext: str = ".hdr",
2357        clean_header: bool = True,
2358        remove_chrom_line: bool = False,
2359    ) -> str:
2360        """
2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2362        specified options, and writes it to a new file.
2363
2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2365        this parameter is not specified, the header will be written to the output file
2366        :type header_name: str
2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
2368        specify the name of the output file where the header will be written. If this parameter is not
2369        provided, the header will be written to a temporary file
2370        :type output_file: str
2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
2373        if not specified by the user. This extension will be appended to the `output_file` name to
2374        create the final, defaults to .hdr
2375        :type output_file_ext: str (optional)
2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2378        `True`, the function will clean the header by modifying certain lines based on a specific
2379        pattern. If `clean_header`, defaults to True
2380        :type clean_header: bool (optional)
2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2382        boolean flag that determines whether the #CHROM line should be removed from the header before
2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2384        defaults to False
2385        :type remove_chrom_line: bool (optional)
2386        :return: The function `export_header` returns the name of the temporary header file that is
2387        created.
2388        """
2389
2390        if not header_name and not output_file:
2391            output_file = self.get_output()
2392
2393        if self.get_header():
2394
2395            # Get header object
2396            header_obj = self.get_header()
2397
2398            # Create database
2399            db_for_header = Database(database=self.get_input())
2400
2401            # Get real columns in the file
2402            db_header_columns = db_for_header.get_columns()
2403
2404            with tempfile.TemporaryDirectory() as tmpdir:
2405
2406                # Write header file
2407                header_file_tmp = os.path.join(tmpdir, "header")
2408                f = open(header_file_tmp, "w")
2409                vcf.Writer(f, header_obj)
2410                f.close()
2411
2412                # Replace #CHROM line with rel columns
2413                header_list = db_for_header.read_header_file(
2414                    header_file=header_file_tmp
2415                )
2416                header_list[-1] = "\t".join(db_header_columns)
2417
2418                # Remove CHROM line
2419                if remove_chrom_line:
2420                    header_list.pop()
2421
2422                # Clean header
2423                if clean_header:
2424                    header_list_clean = []
2425                    for head in header_list:
2426                        # Clean head for malformed header
2427                        head_clean = head
2428                        head_clean = re.subn(
2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2431                            head_clean,
2432                            2,
2433                        )[0]
2434                        # Write header
2435                        header_list_clean.append(head_clean)
2436                    header_list = header_list_clean
2437
2438            tmp_header_name = output_file + output_file_ext
2439
2440            f = open(tmp_header_name, "w")
2441            for line in header_list:
2442                f.write(line)
2443            f.close()
2444
2445        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False it is kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2447    def export_variant_vcf(
2448        self,
2449        vcf_file,
2450        remove_info: bool = False,
2451        add_samples: bool = True,
2452        list_samples: list = [],
2453        where_clause: str = "",
2454        index: bool = False,
2455        threads: int | None = None,
2456    ) -> bool | None:
2457        """
2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2459        remove INFO field, add samples, and control compression and indexing.
2460
2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2462        written to. It is the output file that will contain the filtered VCF data based on the specified
2463        parameters
2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2467        in, defaults to False
2468        :type remove_info: bool (optional)
2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2471        If set to False, the samples will be removed. The default value is True, defaults to True
2472        :type add_samples: bool (optional)
2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2474        in the output VCF file. By default, all samples will be included. If you provide a list of
2475        samples, only those samples will be included in the output file
2476        :type list_samples: list
2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2478        determines whether or not to create an index for the output VCF file. If `index` is set to
2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2480        :type index: bool (optional)
2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
2483        will be used during the export process. More threads can potentially speed up the export process
2484        by utilizing multiple cores of the processor. If
2485        :type threads: int | None
2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2487        method with various parameters including the output file, query, threads, sort flag, and index
2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
2489        specified parameters and configurations provided in the `export_variant_vcf` function.
2490        """
2491
2492        # Config
2493        config = self.get_config()
2494
2495        # Extract VCF
2496        log.debug("Export VCF...")
2497
2498        # Table variants
2499        table_variants = self.get_table_variants()
2500
2501        # Threads
2502        if not threads:
2503            threads = self.get_threads()
2504
2505        # Info fields
2506        if remove_info:
2507            if not isinstance(remove_info, str):
2508                remove_info = "."
2509            info_field = f"""'{remove_info}' as INFO"""
2510        else:
2511            info_field = "INFO"
2512
2513        # Samples fields
2514        if add_samples:
2515            if not list_samples:
2516                list_samples = self.get_header_sample_list()
2517            if list_samples:
2518                samples_fields = " , FORMAT , " + " , ".join(
2519                    [f""" "{sample}" """ for sample in list_samples]
2520                )
2521            else:
2522                samples_fields = ""
2523            log.debug(f"samples_fields: {samples_fields}")
2524        else:
2525            samples_fields = ""
2526
2527        # Where clause
2528        if where_clause is None:
2529            where_clause = ""
2530
2531        # Variants
2532        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2533        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2534        log.debug(f"sql_query_select={sql_query_select}")
2535
2536        return self.export_output(
2537            output_file=vcf_file,
2538            output_header=None,
2539            export_header=True,
2540            query=sql_query_select,
2541            parquet_partitions=None,
2542            chunk_size=config.get("chunk_size", None),
2543            threads=threads,
2544            sort=True,
2545            index=index,
2546            order_by=None,
2547        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2549    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2550        """
2551        It takes a list of commands and runs them in parallel using the number of threads specified
2552
2553        :param commands: A list of commands to run
2554        :param threads: The number of threads to use, defaults to 1 (optional)
2555        """
2556
2557        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2559    def get_threads(self, default: int = 1) -> int:
2560        """
2561        This function returns the number of threads to use for a job, with a default value of 1 if not
2562        specified.
2563
2564        :param default: The `default` parameter in the `get_threads` method is used to specify the
2565        default number of threads to use if no specific value is provided. If no value is provided for
2566        the `threads` parameter in the configuration or input parameters, the `default` value will be
2567        used, defaults to 1
2568        :type default: int (optional)
2569        :return: the number of threads to use for the current job.
2570        """
2571
2572        # Config
2573        config = self.get_config()
2574
2575        # Param
2576        param = self.get_param()
2577
2578        # Input threads
2579        input_thread = param.get("threads", config.get("threads", None))
2580
2581        # Check threads
2582        if not input_thread:
2583            threads = default
2584        elif int(input_thread) <= 0:
2585            threads = os.cpu_count()
2586        else:
2587            threads = int(input_thread)
2588        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2590    def get_memory(self, default: str = None) -> str:
2591        """
2592        This function retrieves the memory value from parameters or configuration with a default value
2593        if not found.
2594
2595        :param default: The `get_memory` function takes in a default value as a string parameter. This
2596        default value is used as a fallback in case the `memory` parameter is not provided in the
2597        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2598        the function
2599        :type default: str
2600        :return: The `get_memory` function returns a string value representing the memory parameter. If
2601        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2602        return the default value provided as an argument to the function.
2603        """
2604
2605        # Config
2606        config = self.get_config()
2607
2608        # Param
2609        param = self.get_param()
2610
2611        # Input threads
2612        input_memory = param.get("memory", config.get("memory", None))
2613
2614        # Check threads
2615        if input_memory:
2616            memory = input_memory
2617        else:
2618            memory = default
2619
2620        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2622    def update_from_vcf(self, vcf_file: str) -> None:
2623        """
2624        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2625
2626        :param vcf_file: the path to the VCF file
2627        """
2628
2629        connexion_format = self.get_connexion_format()
2630
2631        if connexion_format in ["duckdb"]:
2632            self.update_from_vcf_duckdb(vcf_file)
2633        elif connexion_format in ["sqlite"]:
2634            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the
        variants table (duckdb backend).

        Rows are matched on #CHROM/POS/REF/ALT; when both sides already carry
        annotations they are joined with ';'. Values '' and '.' are treated
        as "no annotation" on either side.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the '##' header lines;
        # duckdb can reference the local DataFrame 'vcf_df' by name in SQL
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Correlated subquery appends the matching VCF INFO (if any, and
        # non-empty) to the existing INFO, inserting ';' only when needed
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the
        variants table (sqlite backend).

        Loads the VCF into a temporary table, then updates matching rows
        (same #CHROM/POS/REF/ALT), joining existing and new annotations with
        ';' when both are non-empty. Values '' and '.' are treated as "no
        annotation". The temporary table is dropped afterwards.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table, skipping header lines
        # ('#'-prefixed); columns are assigned explicitly since header=None
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: sqlite concatenates with the || operator (no concat())
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2750    def drop_variants_table(self) -> None:
2751        """
2752        > This function drops the variants table
2753        """
2754
2755        table_variants = self.get_table_variants()
2756        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2757        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a column to the variants table and populate it with a hash built
        from the assembly name and the `#CHROM`, `POS`, `REF`, and `ALT`
        columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config, with a global default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when none was given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column (skipped when it already exists, unless forced)
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument '"{prefix}SVTYPE"' is a
            # single-quoted string literal, so every row hashes the same
            # constant rather than the per-row SVTYPE column value — confirm
            # whether the column reference was intended here
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporary exploded columns added above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2818    def get_variant_id_column(
2819        self, variant_id_column: str = "variant_id", force: bool = None
2820    ) -> str:
2821        """
2822        This function returns the variant_id column name
2823
2824        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2825        defaults to variant_id
2826        :type variant_id_column: str (optional)
2827        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2828        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2829        if it is not already set, or if it is set
2830        :type force: bool
2831        :return: The variant_id column name.
2832        """
2833
2834        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2840    def scan_databases(
2841        self,
2842        database_formats: list = ["parquet"],
2843        database_releases: list = ["current"],
2844    ) -> dict:
2845        """
2846        The function `scan_databases` scans for available databases based on specified formats and
2847        releases.
2848
2849        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2850        of the databases to be scanned. In this case, the accepted format is "parquet"
2851        :type database_formats: list ["parquet"]
2852        :param database_releases: The `database_releases` parameter is a list that specifies the
2853        releases of the databases to be scanned. In the provided function, the default value for
2854        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2855        databases that are in the "current"
2856        :type database_releases: list
2857        :return: The function `scan_databases` returns a dictionary containing information about
2858        databases that match the specified formats and releases.
2859        """
2860
2861        # Config
2862        config = self.get_config()
2863
2864        # Param
2865        param = self.get_param()
2866
2867        # Param - Assembly
2868        assembly = param.get("assembly", config.get("assembly", None))
2869        if not assembly:
2870            assembly = DEFAULT_ASSEMBLY
2871            log.warning(f"Default assembly '{assembly}'")
2872
2873        # Scan for availabled databases
2874        log.info(
2875            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2876        )
2877        databases_infos_dict = databases_infos(
2878            database_folder_releases=database_releases,
2879            database_formats=database_formats,
2880            assembly=assembly,
2881            config=config,
2882        )
2883        log.info(
2884            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2885        )
2886
2887        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current"
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2889    def annotation(self) -> None:
2890        """
2891        It annotates the VCF file with the annotations specified in the config file.
2892        """
2893
2894        # Config
2895        config = self.get_config()
2896
2897        # Param
2898        param = self.get_param()
2899
2900        # Param - Assembly
2901        assembly = param.get("assembly", config.get("assembly", None))
2902        if not assembly:
2903            assembly = DEFAULT_ASSEMBLY
2904            log.warning(f"Default assembly '{assembly}'")
2905
2906        # annotations databases folders
2907        annotations_databases = set(
2908            config.get("folders", {})
2909            .get("databases", {})
2910            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2911            + config.get("folders", {})
2912            .get("databases", {})
2913            .get("parquet", ["~/howard/databases/parquet/current"])
2914            + config.get("folders", {})
2915            .get("databases", {})
2916            .get("bcftools", ["~/howard/databases/bcftools/current"])
2917        )
2918
2919        # Get param annotations
2920        if param.get("annotations", None) and isinstance(
2921            param.get("annotations", None), str
2922        ):
2923            log.debug(param.get("annotations", None))
2924            param_annotation_list = param.get("annotations").split(",")
2925        else:
2926            param_annotation_list = []
2927
2928        # Each tools param
2929        if param.get("annotation_parquet", None) != None:
2930            log.debug(
2931                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2932            )
2933            if isinstance(param.get("annotation_parquet", None), list):
2934                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2935            else:
2936                param_annotation_list.append(param.get("annotation_parquet"))
2937        if param.get("annotation_snpsift", None) != None:
2938            if isinstance(param.get("annotation_snpsift", None), list):
2939                param_annotation_list.append(
2940                    "snpsift:"
2941                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2942                )
2943            else:
2944                param_annotation_list.append(
2945                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2946                )
2947        if param.get("annotation_snpeff", None) != None:
2948            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2949        if param.get("annotation_bcftools", None) != None:
2950            if isinstance(param.get("annotation_bcftools", None), list):
2951                param_annotation_list.append(
2952                    "bcftools:"
2953                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2954                )
2955            else:
2956                param_annotation_list.append(
2957                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2958                )
2959        if param.get("annotation_annovar", None) != None:
2960            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2961        if param.get("annotation_exomiser", None) != None:
2962            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2963        if param.get("annotation_splice", None) != None:
2964            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2965
2966        # Merge param annotations list
2967        param["annotations"] = ",".join(param_annotation_list)
2968
2969        # debug
2970        log.debug(f"param_annotations={param['annotations']}")
2971
2972        if param.get("annotations"):
2973
2974            # Log
2975            # log.info("Annotations - Check annotation parameters")
2976
2977            if not "annotation" in param:
2978                param["annotation"] = {}
2979
2980            # List of annotations parameters
2981            annotations_list_input = {}
2982            if isinstance(param.get("annotations", None), str):
2983                annotation_file_list = [
2984                    value for value in param.get("annotations", "").split(",")
2985                ]
2986                for annotation_file in annotation_file_list:
2987                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2988            else:
2989                annotations_list_input = param.get("annotations", {})
2990
2991            log.info(f"Quick Annotations:")
2992            for annotation_key in list(annotations_list_input.keys()):
2993                log.info(f"   {annotation_key}")
2994
2995            # List of annotations and associated fields
2996            annotations_list = {}
2997
2998            for annotation_file in annotations_list_input:
2999
3000                # Explode annotations if ALL
3001                if (
3002                    annotation_file.upper() == "ALL"
3003                    or annotation_file.upper().startswith("ALL:")
3004                ):
3005
3006                    # check ALL parameters (formats, releases)
3007                    annotation_file_split = annotation_file.split(":")
3008                    database_formats = "parquet"
3009                    database_releases = "current"
3010                    for annotation_file_option in annotation_file_split[1:]:
3011                        database_all_options_split = annotation_file_option.split("=")
3012                        if database_all_options_split[0] == "format":
3013                            database_formats = database_all_options_split[1].split("+")
3014                        if database_all_options_split[0] == "release":
3015                            database_releases = database_all_options_split[1].split("+")
3016
3017                    # Scan for availabled databases
3018                    databases_infos_dict = self.scan_databases(
3019                        database_formats=database_formats,
3020                        database_releases=database_releases,
3021                    )
3022
3023                    # Add found databases in annotation parameters
3024                    for database_infos in databases_infos_dict.keys():
3025                        annotations_list[database_infos] = {"INFO": None}
3026
3027                else:
3028                    annotations_list[annotation_file] = annotations_list_input[
3029                        annotation_file
3030                    ]
3031
3032            # Check each databases
3033            if len(annotations_list):
3034
3035                log.info(
3036                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3037                )
3038
3039                for annotation_file in annotations_list:
3040
3041                    # Init
3042                    annotations = annotations_list.get(annotation_file, None)
3043
3044                    # Annotation snpEff
3045                    if annotation_file.startswith("snpeff"):
3046
3047                        log.debug(f"Quick Annotation snpEff")
3048
3049                        if "snpeff" not in param["annotation"]:
3050                            param["annotation"]["snpeff"] = {}
3051
3052                        if "options" not in param["annotation"]["snpeff"]:
3053                            param["annotation"]["snpeff"]["options"] = ""
3054
3055                        # snpEff options in annotations
3056                        param["annotation"]["snpeff"]["options"] = "".join(
3057                            annotation_file.split(":")[1:]
3058                        )
3059
3060                    # Annotation Annovar
3061                    elif annotation_file.startswith("annovar"):
3062
3063                        log.debug(f"Quick Annotation Annovar")
3064
3065                        if "annovar" not in param["annotation"]:
3066                            param["annotation"]["annovar"] = {}
3067
3068                        if "annotations" not in param["annotation"]["annovar"]:
3069                            param["annotation"]["annovar"]["annotations"] = {}
3070
3071                        # Options
3072                        annotation_file_split = annotation_file.split(":")
3073                        for annotation_file_annotation in annotation_file_split[1:]:
3074                            if annotation_file_annotation:
3075                                param["annotation"]["annovar"]["annotations"][
3076                                    annotation_file_annotation
3077                                ] = annotations
3078
3079                    # Annotation Exomiser
3080                    elif annotation_file.startswith("exomiser"):
3081
3082                        log.debug(f"Quick Annotation Exomiser")
3083
3084                        param["annotation"]["exomiser"] = params_string_to_dict(
3085                            annotation_file
3086                        )
3087
3088                    # Annotation Splice
3089                    elif annotation_file.startswith("splice"):
3090
3091                        log.debug(f"Quick Annotation Splice")
3092
3093                        param["annotation"]["splice"] = params_string_to_dict(
3094                            annotation_file
3095                        )
3096
3097                    # Annotation Parquet or BCFTOOLS
3098                    else:
3099
3100                        # Tools detection
3101                        if annotation_file.startswith("bcftools:"):
3102                            annotation_tool_initial = "bcftools"
3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
3104                        elif annotation_file.startswith("snpsift:"):
3105                            annotation_tool_initial = "snpsift"
3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
3107                        elif annotation_file.startswith("bigwig:"):
3108                            annotation_tool_initial = "bigwig"
3109                            annotation_file = ":".join(annotation_file.split(":")[1:])
3110                        else:
3111                            annotation_tool_initial = None
3112
3113                        # list of files
3114                        annotation_file_list = annotation_file.replace("+", ":").split(
3115                            ":"
3116                        )
3117
3118                        for annotation_file in annotation_file_list:
3119
3120                            if annotation_file:
3121
3122                                # Annotation tool initial
3123                                annotation_tool = annotation_tool_initial
3124
3125                                # Find file
3126                                annotation_file_found = None
3127
3128                                if os.path.exists(annotation_file):
3129                                    annotation_file_found = annotation_file
3130                                elif os.path.exists(full_path(annotation_file)):
3131                                    annotation_file_found = full_path(annotation_file)
3132                                else:
3133                                    # Find within assembly folders
3134                                    for annotations_database in annotations_databases:
3135                                        found_files = find_all(
3136                                            annotation_file,
3137                                            os.path.join(
3138                                                annotations_database, assembly
3139                                            ),
3140                                        )
3141                                        if len(found_files) > 0:
3142                                            annotation_file_found = found_files[0]
3143                                            break
3144                                    if not annotation_file_found and not assembly:
3145                                        # Find within folders
3146                                        for (
3147                                            annotations_database
3148                                        ) in annotations_databases:
3149                                            found_files = find_all(
3150                                                annotation_file, annotations_database
3151                                            )
3152                                            if len(found_files) > 0:
3153                                                annotation_file_found = found_files[0]
3154                                                break
3155                                log.debug(
3156                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3157                                )
3158
3159                                # Full path
3160                                annotation_file_found = full_path(annotation_file_found)
3161
3162                                if annotation_file_found:
3163
3164                                    database = Database(database=annotation_file_found)
3165                                    quick_annotation_format = database.get_format()
3166                                    quick_annotation_is_compressed = (
3167                                        database.is_compressed()
3168                                    )
3169                                    quick_annotation_is_indexed = os.path.exists(
3170                                        f"{annotation_file_found}.tbi"
3171                                    )
3172                                    bcftools_preference = False
3173
3174                                    # Check Annotation Tool
3175                                    if not annotation_tool:
3176                                        if (
3177                                            bcftools_preference
3178                                            and quick_annotation_format
3179                                            in ["vcf", "bed"]
3180                                            and quick_annotation_is_compressed
3181                                            and quick_annotation_is_indexed
3182                                        ):
3183                                            annotation_tool = "bcftools"
3184                                        elif quick_annotation_format in [
3185                                            "vcf",
3186                                            "bed",
3187                                            "tsv",
3188                                            "tsv",
3189                                            "csv",
3190                                            "json",
3191                                            "tbl",
3192                                            "parquet",
3193                                            "duckdb",
3194                                        ]:
3195                                            annotation_tool = "parquet"
3196                                        elif quick_annotation_format in ["bw"]:
3197                                            annotation_tool = "bigwig"
3198                                        else:
3199                                            log.error(
3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3201                                            )
3202                                            raise ValueError(
3203                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3204                                            )
3205
3206                                    log.debug(
3207                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3208                                    )
3209
3210                                    # Annotation Tool dispatch
3211                                    if annotation_tool:
3212                                        if annotation_tool not in param["annotation"]:
3213                                            param["annotation"][annotation_tool] = {}
3214                                        if (
3215                                            "annotations"
3216                                            not in param["annotation"][annotation_tool]
3217                                        ):
3218                                            param["annotation"][annotation_tool][
3219                                                "annotations"
3220                                            ] = {}
3221                                        param["annotation"][annotation_tool][
3222                                            "annotations"
3223                                        ][annotation_file_found] = annotations
3224
3225                                else:
3226                                    log.warning(
3227                                        f"Quick Annotation File {annotation_file} does NOT exist"
3228                                    )
3229
3230                self.set_param(param)
3231
3232        if param.get("annotation", None):
3233            log.info("Annotations")
3234            if param.get("annotation", {}).get("parquet", None):
3235                log.info("Annotations 'parquet'...")
3236                self.annotation_parquet()
3237            if param.get("annotation", {}).get("bcftools", None):
3238                log.info("Annotations 'bcftools'...")
3239                self.annotation_bcftools()
3240            if param.get("annotation", {}).get("snpsift", None):
3241                log.info("Annotations 'snpsift'...")
3242                self.annotation_snpsift()
3243            if param.get("annotation", {}).get("bigwig", None):
3244                log.info("Annotations 'bigwig'...")
3245                self.annotation_bigwig()
3246            if param.get("annotation", {}).get("annovar", None):
3247                log.info("Annotations 'annovar'...")
3248                self.annotation_annovar()
3249            if param.get("annotation", {}).get("snpeff", None):
3250                log.info("Annotations 'snpeff'...")
3251                self.annotation_snpeff()
3252            if param.get("annotation", {}).get("exomiser", None) is not None:
3253                log.info("Annotations 'exomiser'...")
3254                self.annotation_exomiser()
3255            if param.get("annotation", {}).get("splice", None) is not None:
3256                log.info("Annotations 'splice' ...")
3257                self.annotation_splice()
3258
3259        # Explode INFOS fields into table fields
3260        if self.get_explode_infos():
3261            self.explode_infos(
3262                prefix=self.get_explode_infos_prefix(),
3263                fields=self.get_explode_infos_fields(),
3264                force=True,
3265            )

It annotates the VCF file with the annotations specified in the parameters (`param["annotation"]`), dispatching each configured annotation tool in turn (parquet, bcftools, snpsift, bigwig, annovar, snpeff, exomiser, splice) and finally exploding INFO fields into table columns when requested.

def annotation_bigwig(self, threads: int = None) -> None:
3267    def annotation_bigwig(self, threads: int = None) -> None:
3268        """
3269        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3270
3271        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3272        number of threads to be used for parallel processing during the annotation process. If the
3273        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3274        threads to use based on the system configuration
3275        :type threads: int
3276        :return: True
3277        """
3278
3279        # DEBUG
3280        log.debug("Start annotation with bigwig databases")
3281
3282        # # Threads
3283        # if not threads:
3284        #     threads = self.get_threads()
3285        # log.debug("Threads: " + str(threads))
3286
3287        # Config
3288        config = self.get_config()
3289        log.debug("Config: " + str(config))
3290
3291        # Config - BCFTools databases folders
3292        databases_folders = set(
3293            self.get_config()
3294            .get("folders", {})
3295            .get("databases", {})
3296            .get("annotations", ["."])
3297            + self.get_config()
3298            .get("folders", {})
3299            .get("databases", {})
3300            .get("bigwig", ["."])
3301        )
3302        log.debug("Databases annotations: " + str(databases_folders))
3303
3304        # Param
3305        annotations = (
3306            self.get_param()
3307            .get("annotation", {})
3308            .get("bigwig", {})
3309            .get("annotations", None)
3310        )
3311        log.debug("Annotations: " + str(annotations))
3312
3313        # Assembly
3314        assembly = self.get_param().get(
3315            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3316        )
3317
3318        # Data
3319        table_variants = self.get_table_variants()
3320
3321        # Check if not empty
3322        log.debug("Check if not empty")
3323        sql_query_chromosomes = (
3324            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3325        )
3326        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3327        if not sql_query_chromosomes_df["count"][0]:
3328            log.info(f"VCF empty")
3329            return
3330
3331        # VCF header
3332        vcf_reader = self.get_header()
3333        log.debug("Initial header: " + str(vcf_reader.infos))
3334
3335        # Existing annotations
3336        for vcf_annotation in self.get_header().infos:
3337
3338            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3339            log.debug(
3340                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3341            )
3342
3343        if annotations:
3344
3345            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3346
3347                # Export VCF file
3348                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3349
3350                # annotation_bigwig_config
3351                annotation_bigwig_config_list = []
3352
3353                for annotation in annotations:
3354                    annotation_fields = annotations[annotation]
3355
3356                    # Annotation Name
3357                    annotation_name = os.path.basename(annotation)
3358
3359                    if not annotation_fields:
3360                        annotation_fields = {"INFO": None}
3361
3362                    log.debug(f"Annotation '{annotation_name}'")
3363                    log.debug(
3364                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3365                    )
3366
3367                    # Create Database
3368                    database = Database(
3369                        database=annotation,
3370                        databases_folders=databases_folders,
3371                        assembly=assembly,
3372                    )
3373
3374                    # Find files
3375                    db_file = database.get_database()
3376                    db_file = full_path(db_file)
3377                    db_hdr_file = database.get_header_file()
3378                    db_hdr_file = full_path(db_hdr_file)
3379                    db_file_type = database.get_format()
3380
3381                    # If db_file is http ?
3382                    if database.get_database().startswith("http"):
3383
3384                        # Datbase is HTTP URL
3385                        db_file_is_http = True
3386
3387                        # DB file keep as URL
3388                        db_file = database.get_database()
3389                        log.warning(
3390                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
3391                        )
3392
3393                        # Retrieve automatic annotation field name
3394                        annotation_field = clean_annotation_field(
3395                            os.path.basename(db_file).replace(".bw", "")
3396                        )
3397                        log.debug(
3398                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
3399                        )
3400
3401                        # Create automatic header file
3402                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3403                        with open(db_hdr_file, "w") as f:
3404                            f.write("##fileformat=VCFv4.2\n")
3405                            f.write(
3406                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
3407                            )
3408                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3409
3410                    else:
3411
3412                        # Datbase is NOT HTTP URL
3413                        db_file_is_http = False
3414
3415                    # Check index - try to create if not exists
3416                    if (
3417                        db_file is None
3418                        or db_hdr_file is None
3419                        or (not os.path.exists(db_file) and not db_file_is_http)
3420                        or not os.path.exists(db_hdr_file)
3421                        or not db_file_type in ["bw"]
3422                    ):
3423                        # if False:
3424                        log.error("Annotation failed: database not valid")
3425                        log.error(f"Annotation annotation file: {db_file}")
3426                        log.error(f"Annotation annotation file type: {db_file_type}")
3427                        log.error(f"Annotation annotation header: {db_hdr_file}")
3428                        raise ValueError(
3429                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3430                        )
3431                    else:
3432
3433                        # Log
3434                        log.debug(
3435                            f"Annotation '{annotation}' - file: "
3436                            + str(db_file)
3437                            + " and "
3438                            + str(db_hdr_file)
3439                        )
3440
3441                        # Load header as VCF object
3442                        db_hdr_vcf = Variants(input=db_hdr_file)
3443                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3444                        log.debug(
3445                            "Annotation database header: "
3446                            + str(db_hdr_vcf_header_infos)
3447                        )
3448
3449                        # For all fields in database
3450                        annotation_fields_full = False
3451                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3452                            annotation_fields = {
3453                                key: key for key in db_hdr_vcf_header_infos
3454                            }
3455                            log.debug(
3456                                "Annotation database header - All annotations added: "
3457                                + str(annotation_fields)
3458                            )
3459                            annotation_fields_full = True
3460
3461                        # Init
3462                        cyvcf2_header_rename_dict = {}
3463                        cyvcf2_header_list = []
3464                        cyvcf2_header_indexes = {}
3465
3466                        # process annotation fields
3467                        for annotation_field in annotation_fields:
3468
3469                            # New annotation name
3470                            annotation_field_new = annotation_fields[annotation_field]
3471
3472                            # Check annotation field and index in header
3473                            if (
3474                                annotation_field
3475                                in db_hdr_vcf.get_header_columns_as_list()
3476                            ):
3477                                annotation_field_index = (
3478                                    db_hdr_vcf.get_header_columns_as_list().index(
3479                                        annotation_field
3480                                    )
3481                                    - 3
3482                                )
3483                                cyvcf2_header_indexes[annotation_field_new] = (
3484                                    annotation_field_index
3485                                )
3486                            else:
3487                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3488                                log.error(msg_err)
3489                                raise ValueError(msg_err)
3490
3491                            # Append annotation field in cyvcf2 header list
3492                            cyvcf2_header_rename_dict[annotation_field_new] = (
3493                                db_hdr_vcf_header_infos[annotation_field].id
3494                            )
3495                            cyvcf2_header_list.append(
3496                                {
3497                                    "ID": annotation_field_new,
3498                                    "Number": db_hdr_vcf_header_infos[
3499                                        annotation_field
3500                                    ].num,
3501                                    "Type": db_hdr_vcf_header_infos[
3502                                        annotation_field
3503                                    ].type,
3504                                    "Description": db_hdr_vcf_header_infos[
3505                                        annotation_field
3506                                    ].desc,
3507                                }
3508                            )
3509
3510                            # Add header on VCF
3511                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
3512                                annotation_field_new,
3513                                db_hdr_vcf_header_infos[annotation_field].num,
3514                                db_hdr_vcf_header_infos[annotation_field].type,
3515                                db_hdr_vcf_header_infos[annotation_field].desc,
3516                                "HOWARD BigWig annotation",
3517                                "unknown",
3518                                self.code_type_map[
3519                                    db_hdr_vcf_header_infos[annotation_field].type
3520                                ],
3521                            )
3522
3523                        # Load bigwig database
3524                        bw_db = pyBigWig.open(db_file)
3525                        if bw_db.isBigWig():
3526                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3527                        else:
3528                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3529                            log.error(msg_err)
3530                            raise ValueError(msg_err)
3531
3532                        annotation_bigwig_config_list.append(
3533                            {
3534                                "db_file": db_file,
3535                                "bw_db": bw_db,
3536                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3537                                "cyvcf2_header_list": cyvcf2_header_list,
3538                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
3539                            }
3540                        )
3541
3542                # Annotate
3543                if annotation_bigwig_config_list:
3544
3545                    # Annotation config
3546                    log.debug(
3547                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
3548                    )
3549
3550                    # Export VCF file
3551                    self.export_variant_vcf(
3552                        vcf_file=tmp_vcf_name,
3553                        remove_info=True,
3554                        add_samples=False,
3555                        index=True,
3556                    )
3557
3558                    # Load input tmp file
3559                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3560
3561                    # Add header in input file
3562                    for annotation_bigwig_config in annotation_bigwig_config_list:
3563                        for cyvcf2_header_field in annotation_bigwig_config.get(
3564                            "cyvcf2_header_list", []
3565                        ):
3566                            log.info(
3567                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
3568                            )
3569                            input_vcf.add_info_to_header(cyvcf2_header_field)
3570
3571                    # Create output VCF file
3572                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
3573                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3574
3575                    # Fetch variants
3576                    log.info(f"Annotations 'bigwig' start...")
3577                    for variant in input_vcf:
3578
3579                        for annotation_bigwig_config in annotation_bigwig_config_list:
3580
3581                            # DB and indexes
3582                            bw_db = annotation_bigwig_config.get("bw_db", None)
3583                            cyvcf2_header_indexes = annotation_bigwig_config.get(
3584                                "cyvcf2_header_indexes", None
3585                            )
3586
3587                            # Retrieve value from chrom pos
3588                            res = bw_db.values(
3589                                variant.CHROM, variant.POS - 1, variant.POS
3590                            )
3591
3592                            # For each annotation fields (and indexes)
3593                            for cyvcf2_header_index in cyvcf2_header_indexes:
3594
3595                                # If value is NOT nNone
3596                                if not np.isnan(
3597                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
3598                                ):
3599                                    variant.INFO[cyvcf2_header_index] = res[
3600                                        cyvcf2_header_indexes[cyvcf2_header_index]
3601                                    ]
3602
3603                        # Add record in output file
3604                        output_vcf.write_record(variant)
3605
3606                    # Log
3607                    log.debug(f"Annotation done.")
3608
3609                    # Close and write file
3610                    log.info(f"Annotations 'bigwig' write...")
3611                    output_vcf.close()
3612                    log.debug(f"Write done.")
3613
3614                    # Update variants
3615                    log.info(f"Annotations 'bigwig' update...")
3616                    self.update_from_vcf(output_vcf_file)
3617                    log.debug(f"Update done.")
3618
3619        return True

The function annotation_bigwig annotates variants in a VCF file using BigWig databases.

Parameters
  • threads: The number of threads intended for parallel processing during annotation. This parameter is currently unused by the implementation (the thread-detection code is commented out) and is kept for API consistency with the other annotation methods.
Returns

True when the annotation step completes; None when the variants table is empty.

def annotation_snpsift(self, threads: int = None) -> None:
3621    def annotation_snpsift(self, threads: int = None) -> None:
3622        """
3623        This function annotate with bcftools
3624
3625        :param threads: Number of threads to use
3626        :return: the value of the variable "return_value".
3627        """
3628
3629        # DEBUG
3630        log.debug("Start annotation with bcftools databases")
3631
3632        # Threads
3633        if not threads:
3634            threads = self.get_threads()
3635        log.debug("Threads: " + str(threads))
3636
3637        # Config
3638        config = self.get_config()
3639        log.debug("Config: " + str(config))
3640
3641        # Config - snpSift
3642        snpsift_bin_command = get_bin_command(
3643            bin="SnpSift.jar",
3644            tool="snpsift",
3645            bin_type="jar",
3646            config=config,
3647            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3648        )
3649        if not snpsift_bin_command:
3650            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3651            log.error(msg_err)
3652            raise ValueError(msg_err)
3653
3654        # Config - bcftools
3655        bcftools_bin_command = get_bin_command(
3656            bin="bcftools",
3657            tool="bcftools",
3658            bin_type="bin",
3659            config=config,
3660            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3661        )
3662        if not bcftools_bin_command:
3663            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3664            log.error(msg_err)
3665            raise ValueError(msg_err)
3666
3667        # Config - BCFTools databases folders
3668        databases_folders = set(
3669            self.get_config()
3670            .get("folders", {})
3671            .get("databases", {})
3672            .get("annotations", ["."])
3673            + self.get_config()
3674            .get("folders", {})
3675            .get("databases", {})
3676            .get("bcftools", ["."])
3677        )
3678        log.debug("Databases annotations: " + str(databases_folders))
3679
3680        # Param
3681        annotations = (
3682            self.get_param()
3683            .get("annotation", {})
3684            .get("snpsift", {})
3685            .get("annotations", None)
3686        )
3687        log.debug("Annotations: " + str(annotations))
3688
3689        # Assembly
3690        assembly = self.get_param().get(
3691            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3692        )
3693
3694        # Data
3695        table_variants = self.get_table_variants()
3696
3697        # Check if not empty
3698        log.debug("Check if not empty")
3699        sql_query_chromosomes = (
3700            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3701        )
3702        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3703        if not sql_query_chromosomes_df["count"][0]:
3704            log.info(f"VCF empty")
3705            return
3706
3707        # VCF header
3708        vcf_reader = self.get_header()
3709        log.debug("Initial header: " + str(vcf_reader.infos))
3710
3711        # Existing annotations
3712        for vcf_annotation in self.get_header().infos:
3713
3714            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3715            log.debug(
3716                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3717            )
3718
3719        if annotations:
3720
3721            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3722
3723                # Export VCF file
3724                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3725
3726                # Init
3727                commands = {}
3728
3729                for annotation in annotations:
3730                    annotation_fields = annotations[annotation]
3731
3732                    # Annotation Name
3733                    annotation_name = os.path.basename(annotation)
3734
3735                    if not annotation_fields:
3736                        annotation_fields = {"INFO": None}
3737
3738                    log.debug(f"Annotation '{annotation_name}'")
3739                    log.debug(
3740                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3741                    )
3742
3743                    # Create Database
3744                    database = Database(
3745                        database=annotation,
3746                        databases_folders=databases_folders,
3747                        assembly=assembly,
3748                    )
3749
3750                    # Find files
3751                    db_file = database.get_database()
3752                    db_file = full_path(db_file)
3753                    db_hdr_file = database.get_header_file()
3754                    db_hdr_file = full_path(db_hdr_file)
3755                    db_file_type = database.get_format()
3756                    db_tbi_file = f"{db_file}.tbi"
3757                    db_file_compressed = database.is_compressed()
3758
3759                    # Check if compressed
3760                    if not db_file_compressed:
3761                        log.error(
3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3763                        )
3764                        raise ValueError(
3765                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3766                        )
3767
3768                    # Check if indexed
3769                    if not os.path.exists(db_tbi_file):
3770                        log.error(
3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3772                        )
3773                        raise ValueError(
3774                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3775                        )
3776
3777                    # Check index - try to create if not exists
3778                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3779                        log.error("Annotation failed: database not valid")
3780                        log.error(f"Annotation annotation file: {db_file}")
3781                        log.error(f"Annotation annotation header: {db_hdr_file}")
3782                        log.error(f"Annotation annotation index: {db_tbi_file}")
3783                        raise ValueError(
3784                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3785                        )
3786                    else:
3787
3788                        log.debug(
3789                            f"Annotation '{annotation}' - file: "
3790                            + str(db_file)
3791                            + " and "
3792                            + str(db_hdr_file)
3793                        )
3794
3795                        # Load header as VCF object
3796                        db_hdr_vcf = Variants(input=db_hdr_file)
3797                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3798                        log.debug(
3799                            "Annotation database header: "
3800                            + str(db_hdr_vcf_header_infos)
3801                        )
3802
3803                        # For all fields in database
3804                        annotation_fields_full = False
3805                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3806                            annotation_fields = {
3807                                key: key for key in db_hdr_vcf_header_infos
3808                            }
3809                            log.debug(
3810                                "Annotation database header - All annotations added: "
3811                                + str(annotation_fields)
3812                            )
3813                            annotation_fields_full = True
3814
3815                        # # Create file for field rename
3816                        # log.debug("Create file for field rename")
3817                        # tmp_rename = NamedTemporaryFile(
3818                        #     prefix=self.get_prefix(),
3819                        #     dir=self.get_tmp_dir(),
3820                        #     suffix=".rename",
3821                        #     delete=False,
3822                        # )
3823                        # tmp_rename_name = tmp_rename.name
3824                        # tmp_files.append(tmp_rename_name)
3825
3826                        # Number of fields
3827                        nb_annotation_field = 0
3828                        annotation_list = []
3829                        annotation_infos_rename_list = []
3830
3831                        for annotation_field in annotation_fields:
3832
3833                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3834                            annotation_fields_new_name = annotation_fields.get(
3835                                annotation_field, annotation_field
3836                            )
3837                            if not annotation_fields_new_name:
3838                                annotation_fields_new_name = annotation_field
3839
3840                            # Check if field is in DB and if field is not elready in input data
3841                            if (
3842                                annotation_field in db_hdr_vcf.get_header().infos
3843                                and annotation_fields_new_name
3844                                not in self.get_header().infos
3845                            ):
3846
3847                                log.info(
3848                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3849                                )
3850
3851                                # BCFTools annotate param to rename fields
3852                                if annotation_field != annotation_fields_new_name:
3853                                    annotation_infos_rename_list.append(
3854                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3855                                    )
3856
3857                                # Add INFO field to header
3858                                db_hdr_vcf_header_infos_number = (
3859                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3860                                )
3861                                db_hdr_vcf_header_infos_type = (
3862                                    db_hdr_vcf_header_infos[annotation_field].type
3863                                    or "String"
3864                                )
3865                                db_hdr_vcf_header_infos_description = (
3866                                    db_hdr_vcf_header_infos[annotation_field].desc
3867                                    or f"{annotation_field} description"
3868                                )
3869                                db_hdr_vcf_header_infos_source = (
3870                                    db_hdr_vcf_header_infos[annotation_field].source
3871                                    or "unknown"
3872                                )
3873                                db_hdr_vcf_header_infos_version = (
3874                                    db_hdr_vcf_header_infos[annotation_field].version
3875                                    or "unknown"
3876                                )
3877
3878                                vcf_reader.infos[annotation_fields_new_name] = (
3879                                    vcf.parser._Info(
3880                                        annotation_fields_new_name,
3881                                        db_hdr_vcf_header_infos_number,
3882                                        db_hdr_vcf_header_infos_type,
3883                                        db_hdr_vcf_header_infos_description,
3884                                        db_hdr_vcf_header_infos_source,
3885                                        db_hdr_vcf_header_infos_version,
3886                                        self.code_type_map[
3887                                            db_hdr_vcf_header_infos_type
3888                                        ],
3889                                    )
3890                                )
3891
3892                                annotation_list.append(annotation_field)
3893
3894                                nb_annotation_field += 1
3895
3896                            else:
3897
3898                                if (
3899                                    annotation_field
3900                                    not in db_hdr_vcf.get_header().infos
3901                                ):
3902                                    log.warning(
3903                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3904                                    )
3905                                if (
3906                                    annotation_fields_new_name
3907                                    in self.get_header().infos
3908                                ):
3909                                    log.warning(
3910                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3911                                    )
3912
3913                        log.info(
3914                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3915                        )
3916
3917                        annotation_infos = ",".join(annotation_list)
3918
3919                        if annotation_infos != "":
3920
3921                            # Annotated VCF (and error file)
3922                            tmp_annotation_vcf_name = os.path.join(
3923                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3924                            )
3925                            tmp_annotation_vcf_name_err = (
3926                                tmp_annotation_vcf_name + ".err"
3927                            )
3928
3929                            # Add fields to annotate
3930                            if not annotation_fields_full:
3931                                annotation_infos_option = f"-info {annotation_infos}"
3932                            else:
3933                                annotation_infos_option = ""
3934
3935                            # Info fields rename
3936                            if annotation_infos_rename_list:
3937                                annotation_infos_rename = " -c " + ",".join(
3938                                    annotation_infos_rename_list
3939                                )
3940                            else:
3941                                annotation_infos_rename = ""
3942
3943                            # Annotate command
3944                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3945
3946                            # Add command
3947                            commands[command_annotate] = tmp_annotation_vcf_name
3948
3949                if commands:
3950
3951                    # Export VCF file
3952                    self.export_variant_vcf(
3953                        vcf_file=tmp_vcf_name,
3954                        remove_info=True,
3955                        add_samples=False,
3956                        index=True,
3957                    )
3958                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3959
3960                    # Num command
3961                    nb_command = 0
3962
3963                    # Annotate
3964                    for command_annotate in commands:
3965                        nb_command += 1
3966                        log.info(
3967                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3968                        )
3969                        log.debug(f"command_annotate={command_annotate}")
3970                        run_parallel_commands([command_annotate], threads)
3971
3972                        # Debug
3973                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3974
3975                        # Update variants
3976                        log.info(
3977                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3978                        )
3979                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools

Parameters
  • threads: Number of threads to use
Returns

the value of the variable "return_value".

def annotation_bcftools(self, threads: int = None) -> None:
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using `bcftools annotate`.

        Databases are taken from param section "annotation" -> "bcftools" ->
        "annotations". For each database, the requested INFO fields (or all of
        them when "ALL"/"INFO" is given) are added to the in-memory VCF header,
        then one `bcftools annotate` command is built per chromosome (restricted
        to merged variant regions via a temporary BED file). All commands are run
        in parallel, the per-chromosome outputs are merged back with
        `bcftools merge`, stderr files are scanned for warnings/errors, and the
        variants table is finally updated from the merged VCF.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :return: None; the variants table is updated in place via
            `self.update_from_vcf` (no value is returned)
        :raises ValueError: if the bcftools binary is missing, a database is not
            compressed/indexed/valid, or any bcftools command wrote "[E::" errors
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files around in debug verbosity to ease troubleshooting
        # NOTE(review): delete_tmp is computed but not read later in this method —
        # presumably consumed elsewhere or vestigial; verify.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config or default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database -> fields to annotate with
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data - name of the variants table to annotate
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF: reserve the temporary file bcftools will annotate
        # (actually written later by export_variant_vcf, only if commands exist)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header object — mutated below to register the new INFO fields
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs to merge
            commands = []  # bcftools annotate shell commands
            tmp_files = []  # temp files removed after the merge
            err_files = []  # stderr files scanned for [W::/[E:: messages

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No fields given means "annotate with everything" (expanded below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to locate the annotation files
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files: data file, header file, format, tabix index
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required for region queries)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check that both the data file and its header file exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header file as a Variants object to read its INFO definitions
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" expands to every INFO field of the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields actually selected for this database
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name, if parameterized — renaming is handled
                        # below through the bcftools ":=" column syntax
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Keep the field only if it exists in the database header
                        # and is not already present in the input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing metadata
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            # Register the (possibly renamed) INFO field on the
                            # in-memory header so the output VCF declares it
                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools "-c" column: "NEW:=INFO/OLD" renames on the fly
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Field skipped: explain which condition failed
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    # Comma-separated "-c" argument for bcftools annotate
                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" meta lines
                        # (drop the "#CHROM" line and any variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped headers, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases have no INFO block: prepend the positional
                        # CHROM,POS,POS columns expected by bcftools for BED input
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome, restricted to the
                        # regions actually covered by variants
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each variant,
                            # clamped at 0, then merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files: output VCF (+ its .err for stderr capture)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate then tabix-index the output
                            # (-Oz1 = bgzf output at compression level 1)
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file: write the current variants (INFO stripped,
                # no samples) into the temp VCF the commands will annotate
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget across the parallel annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when commands outnumber threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (string substitution)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge: command-line list of all per-chromosome annotated VCFs
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file for the merged result (delete=True: cleaned up on close)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command, appended to the merge command so
                    # intermediates are deleted only after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge (--force-samples tolerates duplicate sample names)
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan captured stderr; htslib prefixes
                    # warnings with "[W::" and errors with "[E::"
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info: deduplicated warnings and errors
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info: every captured message, deduplicated
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed: any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with Exomiser

Parameters
  • threads: Number of threads to use
Returns

the value of the variable "return_value".

def annotation_exomiser(self, threads: int = None) -> None:
4462    def annotation_exomiser(self, threads: int = None) -> None:
4463        """
4464        This function annotate with Exomiser
4465
4466        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4467        - "analysis" (dict/file):
4468            Full analysis dictionnary parameters (see Exomiser docs).
4469            Either a dict, or a file in JSON or YAML format.
4470            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4471            Default : None
4472        - "preset" (string):
4473            Analysis preset (available in config folder).
4474            Used if no full "analysis" is provided.
4475            Default: "exome"
4476        - "phenopacket" (dict/file):
4477            Samples and phenotipic features parameters (see Exomiser docs).
4478            Either a dict, or a file in JSON or YAML format.
4479            Default: None
4480        - "subject" (dict):
4481            Sample parameters (see Exomiser docs).
4482            Example:
4483                "subject":
4484                    {
4485                        "id": "ISDBM322017",
4486                        "sex": "FEMALE"
4487                    }
4488            Default: None
4489        - "sample" (string):
4490            Sample name to construct "subject" section:
4491                "subject":
4492                    {
4493                        "id": "<sample>",
4494                        "sex": "UNKNOWN_SEX"
4495                    }
4496            Default: None
4497        - "phenotypicFeatures" (dict)
4498            Phenotypic features to construct "subject" section.
4499            Example:
4500                "phenotypicFeatures":
4501                    [
4502                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4503                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4504                    ]
4505        - "hpo" (list)
4506            List of HPO ids as phenotypic features.
4507            Example:
4508                "hpo": ['0001156', '0001363', '0011304', '0010055']
4509            Default: []
4510        - "outputOptions" (dict):
4511            Output options (see Exomiser docs).
4512            Default:
4513                "output_options" =
4514                    {
4515                        "outputContributingVariantsOnly": False,
4516                        "numGenes": 0,
4517                        "outputFormats": ["TSV_VARIANT", "VCF"]
4518                    }
4519        - "transcript_source" (string):
4520            Transcript source (either "refseq", "ucsc", "ensembl")
4521            Default: "refseq"
4522        - "exomiser_to_info" (boolean):
4523            Add exomiser TSV file columns as INFO fields in VCF.
4524            Default: False
4525        - "release" (string):
4526            Exomise database release.
4527            If not exists, database release will be downloaded (take a while).
4528            Default: None (provided by application.properties configuration file)
4529        - "exomiser_application_properties" (file):
4530            Exomiser configuration file (see Exomiser docs).
4531            Useful to automatically download databases (especially for specific genome databases).
4532
4533        Notes:
4534        - If no sample in parameters, first sample in VCF will be chosen
4535        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4536
4537        :param threads: The number of threads to use
4538        :return: None.
4539        """
4540
4541        # DEBUG
4542        log.debug("Start annotation with Exomiser databases")
4543
4544        # Threads
4545        if not threads:
4546            threads = self.get_threads()
4547        log.debug("Threads: " + str(threads))
4548
4549        # Config
4550        config = self.get_config()
4551        log.debug("Config: " + str(config))
4552
4553        # Config - Folders - Databases
4554        databases_folders = (
4555            config.get("folders", {})
4556            .get("databases", {})
4557            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4558        )
4559        databases_folders = full_path(databases_folders)
4560        if not os.path.exists(databases_folders):
4561            log.error(f"Databases annotations: {databases_folders} NOT found")
4562        log.debug("Databases annotations: " + str(databases_folders))
4563
4564        # Config - Exomiser
4565        exomiser_bin_command = get_bin_command(
4566            bin="exomiser-cli*.jar",
4567            tool="exomiser",
4568            bin_type="jar",
4569            config=config,
4570            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4571        )
4572        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4573        if not exomiser_bin_command:
4574            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4575            log.error(msg_err)
4576            raise ValueError(msg_err)
4577
4578        # Param
4579        param = self.get_param()
4580        log.debug("Param: " + str(param))
4581
4582        # Param - Exomiser
4583        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4584        log.debug(f"Param Exomiser: {param_exomiser}")
4585
4586        # Param - Assembly
4587        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4588        log.debug("Assembly: " + str(assembly))
4589
4590        # Data
4591        table_variants = self.get_table_variants()
4592
4593        # Check if not empty
4594        log.debug("Check if not empty")
4595        sql_query_chromosomes = (
4596            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4597        )
4598        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4599            log.info(f"VCF empty")
4600            return False
4601
4602        # VCF header
4603        vcf_reader = self.get_header()
4604        log.debug("Initial header: " + str(vcf_reader.infos))
4605
4606        # Samples
4607        samples = self.get_header_sample_list()
4608        if not samples:
4609            log.error("No Samples in VCF")
4610            return False
4611        log.debug(f"Samples: {samples}")
4612
4613        # Memory limit
4614        memory_limit = self.get_memory("8G")
4615        log.debug(f"memory_limit: {memory_limit}")
4616
4617        # Exomiser java options
4618        exomiser_java_options = (
4619            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4620        )
4621        log.debug(f"Exomiser java options: {exomiser_java_options}")
4622
4623        # Download Exomiser (if not exists)
4624        exomiser_release = param_exomiser.get("release", None)
4625        exomiser_application_properties = param_exomiser.get(
4626            "exomiser_application_properties", None
4627        )
4628        databases_download_exomiser(
4629            assemblies=[assembly],
4630            exomiser_folder=databases_folders,
4631            exomiser_release=exomiser_release,
4632            exomiser_phenotype_release=exomiser_release,
4633            exomiser_application_properties=exomiser_application_properties,
4634        )
4635
4636        # Force annotation
4637        force_update_annotation = True
4638
4639        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4640            log.debug("Start annotation Exomiser")
4641
4642            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4643
4644                # tmp_dir = "/tmp/exomiser"
4645
4646                ### ANALYSIS ###
4647                ################
4648
4649                # Create analysis.json through analysis dict
4650                # either analysis in param or by default
4651                # depending on preset exome/genome)
4652
4653                # Init analysis dict
4654                param_exomiser_analysis_dict = {}
4655
4656                # analysis from param
4657                param_exomiser_analysis = param_exomiser.get("analysis", {})
4658                param_exomiser_analysis = full_path(param_exomiser_analysis)
4659
4660                # If analysis in param -> load anlaysis json
4661                if param_exomiser_analysis:
4662
4663                    # If param analysis is a file and exists
4664                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4665                        param_exomiser_analysis
4666                    ):
4667                        # Load analysis file into analysis dict (either yaml or json)
4668                        with open(param_exomiser_analysis) as json_file:
4669                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4670
4671                    # If param analysis is a dict
4672                    elif isinstance(param_exomiser_analysis, dict):
4673                        # Load analysis dict into analysis dict (either yaml or json)
4674                        param_exomiser_analysis_dict = param_exomiser_analysis
4675
4676                    # Error analysis type
4677                    else:
4678                        log.error(f"Analysis type unknown. Check param file.")
4679                        raise ValueError(f"Analysis type unknown. Check param file.")
4680
4681                # Case no input analysis config file/dict
4682                # Use preset (exome/genome) to open default config file
4683                if not param_exomiser_analysis_dict:
4684
4685                    # default preset
4686                    default_preset = "exome"
4687
4688                    # Get param preset or default preset
4689                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4690
4691                    # Try to find if preset is a file
4692                    if os.path.exists(param_exomiser_preset):
4693                        # Preset file is provided in full path
4694                        param_exomiser_analysis_default_config_file = (
4695                            param_exomiser_preset
4696                        )
4697                    # elif os.path.exists(full_path(param_exomiser_preset)):
4698                    #     # Preset file is provided in full path
4699                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4700                    elif os.path.exists(
4701                        os.path.join(folder_config, param_exomiser_preset)
4702                    ):
4703                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4704                        param_exomiser_analysis_default_config_file = os.path.join(
4705                            folder_config, param_exomiser_preset
4706                        )
4707                    else:
4708                        # Construct preset file
4709                        param_exomiser_analysis_default_config_file = os.path.join(
4710                            folder_config,
4711                            f"preset-{param_exomiser_preset}-analysis.json",
4712                        )
4713
4714                    # If preset file exists
4715                    param_exomiser_analysis_default_config_file = full_path(
4716                        param_exomiser_analysis_default_config_file
4717                    )
4718                    if os.path.exists(param_exomiser_analysis_default_config_file):
4719                        # Load prest file into analysis dict (either yaml or json)
4720                        with open(
4721                            param_exomiser_analysis_default_config_file
4722                        ) as json_file:
4723                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4724                                json_file
4725                            )
4726
4727                    # Error preset file
4728                    else:
4729                        log.error(
4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4731                        )
4732                        raise ValueError(
4733                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4734                        )
4735
4736                # If no analysis dict created
4737                if not param_exomiser_analysis_dict:
4738                    log.error(f"No analysis config")
4739                    raise ValueError(f"No analysis config")
4740
4741                # Log
4742                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4743
4744                ### PHENOPACKET ###
4745                ###################
4746
4747                # If no PhenoPacket in analysis dict -> check in param
4748                if "phenopacket" not in param_exomiser_analysis_dict:
4749
4750                    # If PhenoPacket in param -> load anlaysis json
4751                    if param_exomiser.get("phenopacket", None):
4752
4753                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4754                        param_exomiser_phenopacket = full_path(
4755                            param_exomiser_phenopacket
4756                        )
4757
4758                        # If param phenopacket is a file and exists
4759                        if isinstance(
4760                            param_exomiser_phenopacket, str
4761                        ) and os.path.exists(param_exomiser_phenopacket):
4762                            # Load phenopacket file into analysis dict (either yaml or json)
4763                            with open(param_exomiser_phenopacket) as json_file:
4764                                param_exomiser_analysis_dict["phenopacket"] = (
4765                                    yaml.safe_load(json_file)
4766                                )
4767
4768                        # If param phenopacket is a dict
4769                        elif isinstance(param_exomiser_phenopacket, dict):
4770                            # Load phenopacket dict into analysis dict (either yaml or json)
4771                            param_exomiser_analysis_dict["phenopacket"] = (
4772                                param_exomiser_phenopacket
4773                            )
4774
4775                        # Error phenopacket type
4776                        else:
4777                            log.error(f"Phenopacket type unknown. Check param file.")
4778                            raise ValueError(
4779                                f"Phenopacket type unknown. Check param file."
4780                            )
4781
4782                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4783                if "phenopacket" not in param_exomiser_analysis_dict:
4784
4785                    # Init PhenoPacket
4786                    param_exomiser_analysis_dict["phenopacket"] = {
4787                        "id": "analysis",
4788                        "proband": {},
4789                    }
4790
4791                    ### Add subject ###
4792
4793                    # If subject exists
4794                    param_exomiser_subject = param_exomiser.get("subject", {})
4795
4796                    # If subject not exists -> found sample ID
4797                    if not param_exomiser_subject:
4798
4799                        # Found sample ID in param
4800                        sample = param_exomiser.get("sample", None)
4801
4802                        # Find sample ID (first sample)
4803                        if not sample:
4804                            sample_list = self.get_header_sample_list()
4805                            if len(sample_list) > 0:
4806                                sample = sample_list[0]
4807                            else:
4808                                log.error(f"No sample found")
4809                                raise ValueError(f"No sample found")
4810
4811                        # Create subject
4812                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4813
4814                    # Add to dict
4815                    param_exomiser_analysis_dict["phenopacket"][
4816                        "subject"
4817                    ] = param_exomiser_subject
4818
4819                    ### Add "phenotypicFeatures" ###
4820
4821                    # If phenotypicFeatures exists
4822                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4823                        "phenotypicFeatures", []
4824                    )
4825
4826                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4827                    if not param_exomiser_phenotypicfeatures:
4828
4829                        # Found HPO in param
4830                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4831
4832                        # Split HPO if list in string format separated by comma
4833                        if isinstance(param_exomiser_hpo, str):
4834                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4835
4836                        # Create HPO list
4837                        for hpo in param_exomiser_hpo:
4838                            hpo_clean = re.sub("[^0-9]", "", hpo)
4839                            param_exomiser_phenotypicfeatures.append(
4840                                {
4841                                    "type": {
4842                                        "id": f"HP:{hpo_clean}",
4843                                        "label": f"HP:{hpo_clean}",
4844                                    }
4845                                }
4846                            )
4847
4848                    # Add to dict
4849                    param_exomiser_analysis_dict["phenopacket"][
4850                        "phenotypicFeatures"
4851                    ] = param_exomiser_phenotypicfeatures
4852
4853                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4854                    if not param_exomiser_phenotypicfeatures:
4855                        for step in param_exomiser_analysis_dict.get(
4856                            "analysis", {}
4857                        ).get("steps", []):
4858                            if "hiPhivePrioritiser" in step:
4859                                param_exomiser_analysis_dict.get("analysis", {}).get(
4860                                    "steps", []
4861                                ).remove(step)
4862
4863                ### Add Input File ###
4864
4865                # Initial file name and htsFiles
4866                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4867                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4868                    {
4869                        "uri": tmp_vcf_name,
4870                        "htsFormat": "VCF",
4871                        "genomeAssembly": assembly,
4872                    }
4873                ]
4874
4875                ### Add metaData ###
4876
4877                # If metaData not in analysis dict
4878                if "metaData" not in param_exomiser_analysis_dict:
4879                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4880                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4881                        "createdBy": "howard",
4882                        "phenopacketSchemaVersion": 1,
4883                    }
4884
4885                ### OutputOptions ###
4886
4887                # Init output result folder
4888                output_results = os.path.join(tmp_dir, "results")
4889
4890                # If no outputOptions in analysis dict
4891                if "outputOptions" not in param_exomiser_analysis_dict:
4892
4893                    # default output formats
4894                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4895
4896                    # Get outputOptions in param
4897                    output_options = param_exomiser.get("outputOptions", None)
4898
4899                    # If no output_options in param -> check
4900                    if not output_options:
4901                        output_options = {
4902                            "outputContributingVariantsOnly": False,
4903                            "numGenes": 0,
4904                            "outputFormats": defaut_output_formats,
4905                        }
4906
4907                    # Replace outputDirectory in output options
4908                    output_options["outputDirectory"] = output_results
4909                    output_options["outputFileName"] = "howard"
4910
4911                    # Add outputOptions in analysis dict
4912                    param_exomiser_analysis_dict["outputOptions"] = output_options
4913
4914                else:
4915
4916                    # Replace output_results and output format (if exists in param)
4917                    param_exomiser_analysis_dict["outputOptions"][
4918                        "outputDirectory"
4919                    ] = output_results
4920                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4921                        list(
4922                            set(
4923                                param_exomiser_analysis_dict.get(
4924                                    "outputOptions", {}
4925                                ).get("outputFormats", [])
4926                                + ["TSV_VARIANT", "VCF"]
4927                            )
4928                        )
4929                    )
4930
4931                # log
4932                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4933
4934                ### ANALYSIS FILE ###
4935                #####################
4936
4937                ### Full JSON analysis config file ###
4938
4939                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4940                with open(exomiser_analysis, "w") as fp:
4941                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4942
4943                ### SPLIT analysis and sample config files
4944
4945                # Splitted analysis dict
4946                param_exomiser_analysis_dict_for_split = (
4947                    param_exomiser_analysis_dict.copy()
4948                )
4949
4950                # Phenopacket JSON file
4951                exomiser_analysis_phenopacket = os.path.join(
4952                    tmp_dir, "analysis_phenopacket.json"
4953                )
4954                with open(exomiser_analysis_phenopacket, "w") as fp:
4955                    json.dump(
4956                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4957                        fp,
4958                        indent=4,
4959                    )
4960
4961                # Analysis JSON file without Phenopacket parameters
4962                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4963                exomiser_analysis_analysis = os.path.join(
4964                    tmp_dir, "analysis_analysis.json"
4965                )
4966                with open(exomiser_analysis_analysis, "w") as fp:
4967                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4968
4969                ### INITAL VCF file ###
4970                #######################
4971
4972                ### Create list of samples to use and include inti initial VCF file ####
4973
4974                # Subject (main sample)
4975                # Get sample ID in analysis dict
4976                sample_subject = (
4977                    param_exomiser_analysis_dict.get("phenopacket", {})
4978                    .get("subject", {})
4979                    .get("id", None)
4980                )
4981                sample_proband = (
4982                    param_exomiser_analysis_dict.get("phenopacket", {})
4983                    .get("proband", {})
4984                    .get("subject", {})
4985                    .get("id", None)
4986                )
4987                sample = []
4988                if sample_subject:
4989                    sample.append(sample_subject)
4990                if sample_proband:
4991                    sample.append(sample_proband)
4992
4993                # Get sample ID within Pedigree
4994                pedigree_persons_list = (
4995                    param_exomiser_analysis_dict.get("phenopacket", {})
4996                    .get("pedigree", {})
4997                    .get("persons", {})
4998                )
4999
5000                # Create list with all sample ID in pedigree (if exists)
5001                pedigree_persons = []
5002                for person in pedigree_persons_list:
5003                    pedigree_persons.append(person.get("individualId"))
5004
5005                # Concat subject sample ID and samples ID in pedigreesamples
5006                samples = list(set(sample + pedigree_persons))
5007
5008                # Check if sample list is not empty
5009                if not samples:
5010                    log.error(f"No samples found")
5011                    raise ValueError(f"No samples found")
5012
5013                # Create VCF with sample (either sample in param or first one by default)
5014                # Export VCF file
5015                self.export_variant_vcf(
5016                    vcf_file=tmp_vcf_name,
5017                    remove_info=True,
5018                    add_samples=True,
5019                    list_samples=samples,
5020                    index=False,
5021                )
5022
5023                ### Execute Exomiser ###
5024                ########################
5025
5026                # Init command
5027                exomiser_command = ""
5028
5029                # Command exomiser options
5030                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
5031
5032                # Release
5033                exomiser_release = param_exomiser.get("release", None)
5034                if exomiser_release:
5035                    # phenotype data version
5036                    exomiser_options += (
5037                        f" --exomiser.phenotype.data-version={exomiser_release} "
5038                    )
5039                    # data version
5040                    exomiser_options += (
5041                        f" --exomiser.{assembly}.data-version={exomiser_release} "
5042                    )
5043                    # variant white list
5044                    variant_white_list_file = (
5045                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
5046                    )
5047                    if os.path.exists(
5048                        os.path.join(
5049                            databases_folders, assembly, variant_white_list_file
5050                        )
5051                    ):
5052                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
5053
5054                # transcript_source
5055                transcript_source = param_exomiser.get(
5056                    "transcript_source", None
5057                )  # ucsc, refseq, ensembl
5058                if transcript_source:
5059                    exomiser_options += (
5060                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
5061                    )
5062
5063                # If analysis contain proband param
5064                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
5065                    "proband", {}
5066                ):
5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
5068
5069                # If no proband (usually uniq sample)
5070                else:
5071                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5072
5073                # Log
5074                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5075
5076                # Run command
5077                result = subprocess.call(
5078                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5079                )
5080                if result:
5081                    log.error("Exomiser command failed")
5082                    raise ValueError("Exomiser command failed")
5083
5084                ### RESULTS ###
5085                ###############
5086
5087                ### Annotate with TSV fields ###
5088
5089                # Init result tsv file
5090                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5091
5092                # Init result tsv file
5093                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5094
5095                # Parse TSV file and explode columns in INFO field
5096                if exomiser_to_info and os.path.exists(output_results_tsv):
5097
5098                    # Log
5099                    log.debug("Exomiser columns to VCF INFO field")
5100
5101                    # Retrieve columns and types
5102                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5103                    output_results_tsv_df = self.get_query_to_df(query)
5104                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5105
5106                    # Init concat fields for update
5107                    sql_query_update_concat_fields = []
5108
5109                    # Fields to avoid
5110                    fields_to_avoid = [
5111                        "CONTIG",
5112                        "START",
5113                        "END",
5114                        "REF",
5115                        "ALT",
5116                        "QUAL",
5117                        "FILTER",
5118                        "GENOTYPE",
5119                    ]
5120
5121                    # List all columns to add into header
5122                    for header_column in output_results_tsv_columns:
5123
5124                        # If header column is enable
5125                        if header_column not in fields_to_avoid:
5126
5127                            # Header info type
5128                            header_info_type = "String"
5129                            header_column_df = output_results_tsv_df[header_column]
5130                            header_column_df_dtype = header_column_df.dtype
5131                            if header_column_df_dtype == object:
5132                                if (
5133                                    pd.to_numeric(header_column_df, errors="coerce")
5134                                    .notnull()
5135                                    .all()
5136                                ):
5137                                    header_info_type = "Float"
5138                            else:
5139                                header_info_type = "Integer"
5140
5141                            # Header info
5142                            characters_to_validate = ["-"]
5143                            pattern = "[" + "".join(characters_to_validate) + "]"
5144                            header_info_name = re.sub(
5145                                pattern,
5146                                "_",
5147                                f"Exomiser_{header_column}".replace("#", ""),
5148                            )
5149                            header_info_number = "."
5150                            header_info_description = (
5151                                f"Exomiser {header_column} annotation"
5152                            )
5153                            header_info_source = "Exomiser"
5154                            header_info_version = "unknown"
5155                            header_info_code = CODE_TYPE_MAP[header_info_type]
5156                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5157                                header_info_name,
5158                                header_info_number,
5159                                header_info_type,
5160                                header_info_description,
5161                                header_info_source,
5162                                header_info_version,
5163                                header_info_code,
5164                            )
5165
5166                            # Add field to add for update to concat fields
5167                            sql_query_update_concat_fields.append(
5168                                f"""
5169                                CASE
5170                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5171                                    THEN concat(
5172                                        '{header_info_name}=',
5173                                        table_parquet."{header_column}",
5174                                        ';'
5175                                        )
5176
5177                                    ELSE ''
5178                                END
5179                            """
5180                            )
5181
5182                    # Update query
5183                    sql_query_update = f"""
5184                        UPDATE {table_variants} as table_variants
5185                            SET INFO = concat(
5186                                            CASE
5187                                                WHEN INFO NOT IN ('', '.')
5188                                                THEN INFO
5189                                                ELSE ''
5190                                            END,
5191                                            CASE
5192                                                WHEN table_variants.INFO NOT IN ('','.')
5193                                                THEN ';'
5194                                                ELSE ''
5195                                            END,
5196                                            (
5197                                            SELECT 
5198                                                concat(
5199                                                    {",".join(sql_query_update_concat_fields)}
5200                                                )
5201                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5202                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5203                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5204                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5205                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5206                                            )
5207                                        )
5208                            ;
5209                        """
5210
5211                    # Update
5212                    self.conn.execute(sql_query_update)
5213
5214                ### Annotate with VCF INFO field ###
5215
5216                # Init result VCF file
5217                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5218
5219                # If VCF exists
5220                if os.path.exists(output_results_vcf):
5221
5222                    # Log
5223                    log.debug("Exomiser result VCF update variants")
5224
5225                    # Find Exomiser INFO field annotation in header
5226                    with gzip.open(output_results_vcf, "rt") as f:
5227                        header_list = self.read_vcf_header(f)
5228                    exomiser_vcf_header = vcf.Reader(
5229                        io.StringIO("\n".join(header_list))
5230                    )
5231
5232                    # Add annotation INFO field to header
5233                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5234
5235                    # Update variants with VCF
5236                    self.update_from_vcf(output_results_vcf)
5237
5238        return True

This function annotate with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5240    def annotation_snpeff(self, threads: int = None) -> None:
5241        """
5242        This function annotate with snpEff
5243
5244        :param threads: The number of threads to use
5245        :return: the value of the variable "return_value".
5246        """
5247
5248        # DEBUG
5249        log.debug("Start annotation with snpeff databases")
5250
5251        # Threads
5252        if not threads:
5253            threads = self.get_threads()
5254        log.debug("Threads: " + str(threads))
5255
5256        # DEBUG
5257        delete_tmp = True
5258        if self.get_config().get("verbosity", "warning") in ["debug"]:
5259            delete_tmp = False
5260            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5261
5262        # Config
5263        config = self.get_config()
5264        log.debug("Config: " + str(config))
5265
5266        # Config - Folders - Databases
5267        databases_folders = (
5268            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5269        )
5270        log.debug("Databases annotations: " + str(databases_folders))
5271
5272        # Config - snpEff bin command
5273        snpeff_bin_command = get_bin_command(
5274            bin="snpEff.jar",
5275            tool="snpeff",
5276            bin_type="jar",
5277            config=config,
5278            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5279        )
5280        if not snpeff_bin_command:
5281            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5282            log.error(msg_err)
5283            raise ValueError(msg_err)
5284
5285        # Config - snpEff databases
5286        snpeff_databases = (
5287            config.get("folders", {})
5288            .get("databases", {})
5289            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5290        )
5291        snpeff_databases = full_path(snpeff_databases)
5292        if snpeff_databases is not None and snpeff_databases != "":
5293            log.debug(f"Create snpEff databases folder")
5294            if not os.path.exists(snpeff_databases):
5295                os.makedirs(snpeff_databases)
5296
5297        # Param
5298        param = self.get_param()
5299        log.debug("Param: " + str(param))
5300
5301        # Param
5302        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5303        log.debug("Options: " + str(options))
5304
5305        # Param - Assembly
5306        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5307
5308        # Param - Options
5309        snpeff_options = (
5310            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5311        )
5312        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5313        snpeff_csvstats = (
5314            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5315        )
5316        if snpeff_stats:
5317            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5318            snpeff_stats = full_path(snpeff_stats)
5319            snpeff_options += f" -stats {snpeff_stats}"
5320        if snpeff_csvstats:
5321            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5322            snpeff_csvstats = full_path(snpeff_csvstats)
5323            snpeff_options += f" -csvStats {snpeff_csvstats}"
5324
5325        # Data
5326        table_variants = self.get_table_variants()
5327
5328        # Check if not empty
5329        log.debug("Check if not empty")
5330        sql_query_chromosomes = (
5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5332        )
5333        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5334        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5335            log.info(f"VCF empty")
5336            return
5337
5338        # Export in VCF
5339        log.debug("Create initial file to annotate")
5340        tmp_vcf = NamedTemporaryFile(
5341            prefix=self.get_prefix(),
5342            dir=self.get_tmp_dir(),
5343            suffix=".vcf.gz",
5344            delete=True,
5345        )
5346        tmp_vcf_name = tmp_vcf.name
5347
5348        # VCF header
5349        vcf_reader = self.get_header()
5350        log.debug("Initial header: " + str(vcf_reader.infos))
5351
5352        # Existing annotations
5353        for vcf_annotation in self.get_header().infos:
5354
5355            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5356            log.debug(
5357                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5358            )
5359
5360        # Memory limit
5361        # if config.get("memory", None):
5362        #     memory_limit = config.get("memory", "8G")
5363        # else:
5364        #     memory_limit = "8G"
5365        memory_limit = self.get_memory("8G")
5366        log.debug(f"memory_limit: {memory_limit}")
5367
5368        # snpEff java options
5369        snpeff_java_options = (
5370            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5371        )
5372        log.debug(f"Exomiser java options: {snpeff_java_options}")
5373
5374        force_update_annotation = True
5375
5376        if "ANN" not in self.get_header().infos or force_update_annotation:
5377
5378            # Check snpEff database
5379            log.debug(f"Check snpEff databases {[assembly]}")
5380            databases_download_snpeff(
5381                folder=snpeff_databases, assemblies=[assembly], config=config
5382            )
5383
5384            # Export VCF file
5385            self.export_variant_vcf(
5386                vcf_file=tmp_vcf_name,
5387                remove_info=True,
5388                add_samples=False,
5389                index=True,
5390            )
5391
5392            # Tmp file
5393            err_files = []
5394            tmp_annotate_vcf = NamedTemporaryFile(
5395                prefix=self.get_prefix(),
5396                dir=self.get_tmp_dir(),
5397                suffix=".vcf",
5398                delete=False,
5399            )
5400            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5401            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5402            err_files.append(tmp_annotate_vcf_name_err)
5403
5404            # Command
5405            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5406            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5407            run_parallel_commands([snpeff_command], 1)
5408
5409            # Error messages
5410            log.info(f"Error/Warning messages:")
5411            error_message_command_all = []
5412            error_message_command_warning = []
5413            error_message_command_err = []
5414            for err_file in err_files:
5415                with open(err_file, "r") as f:
5416                    for line in f:
5417                        message = line.strip()
5418                        error_message_command_all.append(message)
5419                        if line.startswith("[W::"):
5420                            error_message_command_warning.append(message)
5421                        if line.startswith("[E::"):
5422                            error_message_command_err.append(f"{err_file}: " + message)
5423            # log info
5424            for message in list(
5425                set(error_message_command_err + error_message_command_warning)
5426            ):
5427                log.info(f"   {message}")
5428            # debug info
5429            for message in list(set(error_message_command_all)):
5430                log.debug(f"   {message}")
5431            # failed
5432            if len(error_message_command_err):
5433                log.error("Annotation failed: Error in commands")
5434                raise ValueError("Annotation failed: Error in commands")
5435
5436            # Find annotation in header
5437            with open(tmp_annotate_vcf_name, "rt") as f:
5438                header_list = self.read_vcf_header(f)
5439            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5440
5441            for ann in annovar_vcf_header.infos:
5442                if ann not in self.get_header().infos:
5443                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5444
5445            # Update variants
5446            log.info(f"Annotation - Updating...")
5447            self.update_from_vcf(tmp_annotate_vcf_name)
5448
5449        else:
5450            if "ANN" in self.get_header().infos:
5451                log.debug(f"Existing snpEff annotations in VCF")
5452            if force_update_annotation:
5453                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotate with snpEff

Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate variants with Annovar databases.

        Exports the variants to a temporary VCF, runs `table_annovar.pl` once
        per configured database, post-processes each annotated VCF through a
        bcftools/sed/awk pipeline (field renaming, removal of Annovar
        bookkeeping tags, cleanup of empty values), merges all per-database
        results with the initial VCF, updates the variants table from the
        merged file, and removes the temporary files.

        :param threads: number of threads to use (defaults to the instance's
            configured thread count)
        :return: None
        :raises ValueError: if a required binary is missing, the Annovar
            databases folder cannot be configured, or an annotation command
            reports errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, collected for error scanning and final cleanup
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but the cleanup below always
        # runs (see the `if True:` block at the end) — confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (a list falls back to its first entry;
        # the folder is created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of database name -> field-rename dict
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder, created if missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never populated or used — verify
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools
            # --rename-annots below)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized here, so only the
                # current database's stderr (plus the later merge stderr) is
                # scanned — confirm earlier files need not be re-checked
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to keep / renamed fields for this database
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options: extra user options are appended, except
                # "genebase" which is handled via --argument above
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan captured stderr for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            # Merge all per-database annotated VCFs with the initial VCF
            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and merge any
                # new INFO fields into the current header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup is unconditional here; delete_tmp (set from
            # verbosity above) is not consulted — confirm intent
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

None.

def annotation_parquet(self, threads: int = None) -> None:
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with parquet-based annotation databases.

        For each database configured under param["annotation"]["parquet"]["annotations"],
        the requested INFO fields are merged into the INFO column of the variants
        table through per-chromosome SQL UPDATE queries, and the in-memory VCF
        header is extended with the corresponding INFO definitions.

        :param threads: number of threads to use for the annotation
        :return: None (the variants table is updated in place)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # Keep temporary files/folders when verbosity is 'debug'.
        # NOTE(review): delete_tmp is computed but never used in this method.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Folders to scan for databases: union of 'annotations' and 'parquet' folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # 'annotations_update': re-annotate fields already in the header
        # (existing values are stripped from INFO first, see query_dict_remove)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # 'annotations_append': only fill values that are currently empty or '.'
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        # NOTE(review): counts rows of the hardcoded 'variants' table —
        # presumably the same as table_variants; confirm.
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # 'ALL' pseudo-annotation: scan available databases (optionally
            # filtered by formats/releases) and queue each one not already listed
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (empty config means: take the whole INFO)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # Extra (non-header) database columns are registered as
                    # String INFO fields with a generic description
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # Annotate when the field exists in the database header and
                        # either update/append is forced or the (renamed) field is
                        # not already present in the VCF header
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # Strip the existing '<field>=<value>' entry so
                                # the fresh value can be appended cleanly
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # Append mode: restrict the CASE to variants whose
                            # current value for the field is empty or '.'
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            else:
                                # ';' in the value is replaced by ',' to keep the
                                # resulting INFO field well-formed
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    # Fast path: when every database field is requested and the
                    # database exposes a raw INFO column, copy INFO wholesale
                    # instead of extracting field by field
                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Seed with the 'remove field' queries (update mode) so
                        # they execute before the annotation updates
                        query_dict = query_dict_remove

                        # One UPDATE query per chromosome
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Join on overlap between the variant span
                            # [POS, POS+len(REF)-1] and the region (START, END],
                            # aggregating overlapping region values per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # The CASE expressions insert a ';' separator only
                            # when both the existing INFO and the new annotation
                            # string are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The generated queries can nest very deeply when many
                        # fields are annotated at once
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # duckdb reports the number of updated rows of an
                            # UPDATE in a 'Count' column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        # NOTE(review): added_columns is never filled in this method, so this
        # loop is currently a no-op; kept for symmetry with sibling methods
        for added_column in added_columns:
            self.drop_column(column=added_column)

Annotates a VCF file with one or more parquet annotation databases.

Parameters
  • threads: number of threads to use for the annotation
Returns

None — the variants table is updated in place (the signature declares no return value).

def annotation_splice(self, threads: int = None) -> None:
6430    def annotation_splice(self, threads: int = None) -> None:
6431        """
6432        This function annotate with snpEff
6433
6434        :param threads: The number of threads to use
6435        :return: the value of the variable "return_value".
6436        """
6437
6438        # DEBUG
6439        log.debug("Start annotation with splice tools")
6440
6441        # Threads
6442        if not threads:
6443            threads = self.get_threads()
6444        log.debug("Threads: " + str(threads))
6445
6446        # DEBUG
6447        delete_tmp = True
6448        if self.get_config().get("verbosity", "warning") in ["debug"]:
6449            delete_tmp = False
6450            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6451
6452        # Config
6453        config = self.get_config()
6454        log.debug("Config: " + str(config))
6455        splice_config = config.get("tools", {}).get("splice", {})
6456        if not splice_config:
6457            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6458            msg_err = "No Splice tool config"
6459            raise ValueError(msg_err)
6460        log.debug(f"splice_config: {splice_config}")
6461
6462        # Config - Folders - Databases
6463        databases_folders = (
6464            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6465        )
6466        log.debug("Databases annotations: " + str(databases_folders))
6467
6468        # Splice docker image
6469        splice_docker_image = splice_config.get("docker").get("image")
6470
6471        # Pull splice image if it's not already there
6472        if not check_docker_image_exists(splice_docker_image):
6473            log.warning(
6474                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6475            )
6476            try:
6477                command(f"docker pull {splice_config.get('docker').get('image')}")
6478            except subprocess.CalledProcessError:
6479                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6480                log.error(msg_err)
6481                raise ValueError(msg_err)
6482
6483        # Config - splice databases
6484        splice_databases = (
6485            config.get("folders", {})
6486            .get("databases", {})
6487            .get("splice", DEFAULT_SPLICE_FOLDER)
6488        )
6489        splice_databases = full_path(splice_databases)
6490
6491        # Param
6492        param = self.get_param()
6493        log.debug("Param: " + str(param))
6494
6495        # Param
6496        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6497        log.debug("Options: " + str(options))
6498
6499        # Data
6500        table_variants = self.get_table_variants()
6501
6502        # Check if not empty
6503        log.debug("Check if not empty")
6504        sql_query_chromosomes = (
6505            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6506        )
6507        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6508            log.info("VCF empty")
6509            return None
6510
6511        # Export in VCF
6512        log.debug("Create initial file to annotate")
6513
6514        # Create output folder / work folder
6515        if options.get("output_folder", ""):
6516            output_folder = options.get("output_folder", "")
6517            if not os.path.exists(output_folder):
6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
6519        else:
6520            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6521            if not os.path.exists(output_folder):
6522                Path(output_folder).mkdir(parents=True, exist_ok=True)
6523
6524        if options.get("workdir", ""):
6525            workdir = options.get("workdir", "")
6526        else:
6527            workdir = "/work"
6528
6529        # Create tmp VCF file
6530        tmp_vcf = NamedTemporaryFile(
6531            prefix=self.get_prefix(),
6532            dir=output_folder,
6533            suffix=".vcf",
6534            delete=False,
6535        )
6536        tmp_vcf_name = tmp_vcf.name
6537
6538        # VCF header
6539        header = self.get_header()
6540
6541        # Existing annotations
6542        for vcf_annotation in self.get_header().infos:
6543
6544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6545            log.debug(
6546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6547            )
6548
6549        # Memory limit
6550        if config.get("memory", None):
6551            memory_limit = config.get("memory", "8G").upper()
6552            # upper()
6553        else:
6554            memory_limit = "8G"
6555        log.debug(f"memory_limit: {memory_limit}")
6556
6557        # Check number of variants to annotate
6558        where_clause_regex_spliceai = r"SpliceAI_\w+"
6559        where_clause_regex_spip = r"SPiP_\w+"
6560        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6561        df_list_of_variants_to_annotate = self.get_query_to_df(
6562            query=f""" SELECT * FROM variants {where_clause} """
6563        )
6564        if len(df_list_of_variants_to_annotate) == 0:
6565            log.warning(
6566                f"No variants to annotate with splice. Variants probably already annotated with splice"
6567            )
6568            return None
6569        else:
6570            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6571
6572        # Export VCF file
6573        self.export_variant_vcf(
6574            vcf_file=tmp_vcf_name,
6575            remove_info=True,
6576            add_samples=True,
6577            index=False,
6578            where_clause=where_clause,
6579        )
6580        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6581        if any(value for value in splice_config.values() if value is None):
6582            log.warning("At least one splice config parameter is empty")
6583            # exit annotation_splice
6584            return None
6585
6586        # Params in splice nf
6587        def check_values(dico: dict):
6588            """
6589            Ensure parameters for NF splice pipeline
6590            """
6591            for key, val in dico.items():
6592                if key == "genome":
6593                    if any(
6594                        assemb in options.get("genome", {})
6595                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6596                    ):
6597                        yield f"--{key} hg19"
6598                    elif any(
6599                        assemb in options.get("genome", {})
6600                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6601                    ):
6602                        yield f"--{key} hg38"
6603                elif (
6604                    (isinstance(val, str) and val)
6605                    or isinstance(val, int)
6606                    or isinstance(val, bool)
6607                ):
6608                    yield f"--{key} {val}"
6609
6610        # Genome
6611        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6612        options["genome"] = genome
6613        # NF params
6614        nf_params = []
6615        # Add options
6616        if options:
6617            log.debug(options)
6618            nf_params = list(check_values(options))
6619            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6620        else:
6621            log.debug("No NF params provided")
6622        # Add threads
6623        if "threads" not in options.keys():
6624            nf_params.append(f"--threads {threads}")
6625        # Genome path
6626        genome_path = find_genome(
6627            config.get("folders", {})
6628            .get("databases", {})
6629            .get("genomes", DEFAULT_GENOME_FOLDER),
6630            file=f"{genome}.fa",
6631        )
6632        # Add genome path
6633        if not genome_path:
6634            raise ValueError(
6635                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6636            )
6637        else:
6638            log.debug(f"Genome: {genome_path}")
6639            nf_params.append(f"--genome_path {genome_path}")
6640
6641        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6642            """
6643            Setting up updated databases for SPiP and SpliceAI
6644            """
6645
6646            try:
6647
6648                # SpliceAI assembly transcriptome
6649                spliceai_assembly = os.path.join(
6650                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6651                    options.get("genome"),
6652                    "transcriptome",
6653                )
6654                spip_assembly = options.get("genome")
6655
6656                spip = find(
6657                    f"transcriptome_{spip_assembly}.RData",
6658                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6659                )
6660                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6661                log.debug(f"SPiP annotations: {spip}")
6662                log.debug(f"SpliceAI annotations: {spliceai}")
6663                if spip and spliceai:
6664                    return [
6665                        f"--spip_transcriptome {spip}",
6666                        f"--spliceai_transcriptome {spliceai}",
6667                    ]
6668                else:
6669                    log.warning(
6670                        "Can't find splice databases in configuration, use annotations file from image"
6671                    )
6672            except TypeError:
6673                log.warning(
6674                    "Can't find splice databases in configuration, use annotations file from image"
6675                )
6676                return []
6677
6678        # Add options, check if transcriptome option have already beend provided
6679        if (
6680            "spip_transcriptome" not in nf_params
6681            and "spliceai_transcriptome" not in nf_params
6682        ):
6683            splice_reference = splice_annotations(options, config)
6684            if splice_reference:
6685                nf_params.extend(splice_reference)
6686        # nf_params.append(f"--output_folder {output_folder}")
6687        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6688        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6689        log.debug(cmd)
6690        splice_config["docker"]["command"] = cmd
6691
6692        # Ensure proxy is set
6693        proxy = [
6694            f"-e {var}={os.getenv(var)}"
6695            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6696            if os.getenv(var) is not None
6697        ]
6698        docker_cmd = get_bin_command(
6699            tool="splice",
6700            bin_type="docker",
6701            config=config,
6702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6703            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6704        )
6705        # print(docker_cmd)
6706        # exit()
6707        # Docker debug
6708        # if splice_config.get("rm_container"):
6709        #     rm_container = "--rm"
6710        # else:
6711        #     rm_container = ""
6712        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6713        log.debug(docker_cmd)
6714        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6715        log.debug(res.stdout)
6716        if res.stderr:
6717            log.error(res.stderr)
6718        res.check_returncode()
6719        # Update variants
6720        log.info("Annotation - Updating...")
6721        # Test find output vcf
6722        log.debug(
6723            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6724        )
6725        output_vcf = []
6726        # Wrong folder to look in
6727        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6728            if (
6729                files
6730                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6731            ):
6732                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6733        # log.debug(os.listdir(options.get("output_folder")))
6734        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6735        if not output_vcf:
6736            log.debug(
6737                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6738            )
6739        else:
6740            # Get new header from annotated vcf
6741            log.debug(f"Initial header: {len(header.infos)} fields")
6742            # Create new header with splice infos
6743            new_vcf = Variants(input=output_vcf[0])
6744            new_vcf_header = new_vcf.get_header().infos
6745            for keys, infos in new_vcf_header.items():
6746                if keys not in header.infos.keys():
6747                    header.infos[keys] = infos
6748            log.debug(f"New header: {len(header.infos)} fields")
6749            log.debug(f"Splice tmp output: {output_vcf[0]}")
6750            self.update_from_vcf(output_vcf[0])
6751
6752        # Remove file
6753        remove_if_exists(output_vcf)

This function annotates variants with splice prediction tools (SPiP and SpliceAI).

Parameters
  • threads: The number of threads to use
Returns

None (the function returns early when there is nothing to annotate).

def get_config_default(self, name: str) -> dict:
6759    def get_config_default(self, name: str) -> dict:
6760        """
6761        The function `get_config_default` returns a dictionary containing default configurations for
6762        various calculations and prioritizations.
6763
6764        :param name: The `get_config_default` function returns a dictionary containing default
6765        configurations for different calculations and prioritizations. The `name` parameter is used to
6766        specify which specific configuration to retrieve from the dictionary
6767        :type name: str
6768        :return: The function `get_config_default` returns a dictionary containing default configuration
6769        settings for different calculations and prioritizations. The specific configuration settings are
6770        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6771        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6772        returned. If there is no match, an empty dictionary is returned.
6773        """
6774
6775        config_default = {
6776            "calculations": {
6777                "variant_chr_pos_alt_ref": {
6778                    "type": "sql",
6779                    "name": "variant_chr_pos_alt_ref",
6780                    "description": "Create a variant ID with chromosome, position, alt and ref",
6781                    "available": False,
6782                    "output_column_name": "variant_chr_pos_alt_ref",
6783                    "output_column_type": "String",
6784                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6785                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6786                    "operation_info": True,
6787                },
6788                "VARTYPE": {
6789                    "type": "sql",
6790                    "name": "VARTYPE",
6791                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6792                    "available": True,
6793                    "table": "variants",
6794                    "output_column_name": "VARTYPE",
6795                    "output_column_type": "String",
6796                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6797                    "operation_query": """
6798                            CASE
6799                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6800                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6801                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6802                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6803                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6804                                ELSE 'UNDEFINED'
6805                            END
6806                            """,
6807                    "info_fields": ["SVTYPE"],
6808                    "operation_info": True,
6809                },
6810                "snpeff_hgvs": {
6811                    "type": "python",
6812                    "name": "snpeff_hgvs",
6813                    "description": "HGVS nomenclatures from snpEff annotation",
6814                    "available": True,
6815                    "function_name": "calculation_extract_snpeff_hgvs",
6816                    "function_params": ["snpeff_hgvs", "ANN"],
6817                },
6818                "snpeff_ann_explode": {
6819                    "type": "python",
6820                    "name": "snpeff_ann_explode",
6821                    "description": "Explode snpEff annotations with uniquify values",
6822                    "available": True,
6823                    "function_name": "calculation_snpeff_ann_explode",
6824                    "function_params": [False, "fields", "snpeff_", "ANN"],
6825                },
6826                "snpeff_ann_explode_uniquify": {
6827                    "type": "python",
6828                    "name": "snpeff_ann_explode_uniquify",
6829                    "description": "Explode snpEff annotations",
6830                    "available": True,
6831                    "function_name": "calculation_snpeff_ann_explode",
6832                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6833                },
6834                "snpeff_ann_explode_json": {
6835                    "type": "python",
6836                    "name": "snpeff_ann_explode_json",
6837                    "description": "Explode snpEff annotations in JSON format",
6838                    "available": True,
6839                    "function_name": "calculation_snpeff_ann_explode",
6840                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6841                },
6842                "NOMEN": {
6843                    "type": "python",
6844                    "name": "NOMEN",
6845                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6846                    "available": True,
6847                    "function_name": "calculation_extract_nomen",
6848                    "function_params": [],
6849                },
6850                "RENAME_INFO_FIELDS": {
6851                    "type": "python",
6852                    "name": "RENAME_INFO_FIELDS",
6853                    "description": "Rename or remove INFO/tags",
6854                    "available": True,
6855                    "function_name": "calculation_rename_info_fields",
6856                    "function_params": [],
6857                },
6858                "FINDBYPIPELINE": {
6859                    "type": "python",
6860                    "name": "FINDBYPIPELINE",
6861                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6862                    "available": True,
6863                    "function_name": "calculation_find_by_pipeline",
6864                    "function_params": ["findbypipeline"],
6865                },
6866                "FINDBYSAMPLE": {
6867                    "type": "python",
6868                    "name": "FINDBYSAMPLE",
6869                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6870                    "available": True,
6871                    "function_name": "calculation_find_by_pipeline",
6872                    "function_params": ["findbysample"],
6873                },
6874                "GENOTYPECONCORDANCE": {
6875                    "type": "python",
6876                    "name": "GENOTYPECONCORDANCE",
6877                    "description": "Concordance of genotype for multi caller VCF",
6878                    "available": True,
6879                    "function_name": "calculation_genotype_concordance",
6880                    "function_params": [],
6881                },
6882                "BARCODE": {
6883                    "type": "python",
6884                    "name": "BARCODE",
6885                    "description": "BARCODE as VaRank tool",
6886                    "available": True,
6887                    "function_name": "calculation_barcode",
6888                    "function_params": [],
6889                },
6890                "BARCODEFAMILY": {
6891                    "type": "python",
6892                    "name": "BARCODEFAMILY",
6893                    "description": "BARCODEFAMILY as VaRank tool",
6894                    "available": True,
6895                    "function_name": "calculation_barcode_family",
6896                    "function_params": ["BCF"],
6897                },
6898                "TRIO": {
6899                    "type": "python",
6900                    "name": "TRIO",
6901                    "description": "Inheritance for a trio family",
6902                    "available": True,
6903                    "function_name": "calculation_trio",
6904                    "function_params": [],
6905                },
6906                "VAF": {
6907                    "type": "python",
6908                    "name": "VAF",
6909                    "description": "Variant Allele Frequency (VAF) harmonization",
6910                    "available": True,
6911                    "function_name": "calculation_vaf_normalization",
6912                    "function_params": [],
6913                },
6914                "VAF_stats": {
6915                    "type": "python",
6916                    "name": "VAF_stats",
6917                    "description": "Variant Allele Frequency (VAF) statistics",
6918                    "available": True,
6919                    "function_name": "calculation_genotype_stats",
6920                    "function_params": ["VAF"],
6921                },
6922                "DP_stats": {
6923                    "type": "python",
6924                    "name": "DP_stats",
6925                    "description": "Depth (DP) statistics",
6926                    "available": True,
6927                    "function_name": "calculation_genotype_stats",
6928                    "function_params": ["DP"],
6929                },
6930                "variant_id": {
6931                    "type": "python",
6932                    "name": "variant_id",
6933                    "description": "Variant ID generated from variant position and type",
6934                    "available": True,
6935                    "function_name": "calculation_variant_id",
6936                    "function_params": [],
6937                },
6938                "transcripts_json": {
6939                    "type": "python",
6940                    "name": "transcripts_json",
6941                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6942                    "available": True,
6943                    "function_name": "calculation_transcripts_annotation",
6944                    "function_params": ["transcripts_json", None],
6945                },
6946                "transcripts_ann": {
6947                    "type": "python",
6948                    "name": "transcripts_ann",
6949                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6950                    "available": True,
6951                    "function_name": "calculation_transcripts_annotation",
6952                    "function_params": [None, "transcripts_ann"],
6953                },
6954                "transcripts_annotations": {
6955                    "type": "python",
6956                    "name": "transcripts_annotations",
6957                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6958                    "available": True,
6959                    "function_name": "calculation_transcripts_annotation",
6960                    "function_params": [None, None],
6961                },
6962                "transcripts_prioritization": {
6963                    "type": "python",
6964                    "name": "transcripts_prioritization",
6965                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6966                    "available": True,
6967                    "function_name": "calculation_transcripts_prioritization",
6968                    "function_params": [],
6969                },
6970                "transcripts_export": {
6971                    "type": "python",
6972                    "name": "transcripts_export",
6973                    "description": "Export transcripts table/view as a file (using param.json)",
6974                    "available": True,
6975                    "function_name": "calculation_transcripts_export",
6976                    "function_params": [],
6977                },
6978            },
6979            "prioritizations": {
6980                "default": {
6981                    "ANN2": [
6982                        {
6983                            "type": "contains",
6984                            "value": "HIGH",
6985                            "score": 5,
6986                            "flag": "PASS",
6987                            "comment": [
6988                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6989                            ],
6990                        },
6991                        {
6992                            "type": "contains",
6993                            "value": "MODERATE",
6994                            "score": 3,
6995                            "flag": "PASS",
6996                            "comment": [
6997                                "A non-disruptive variant that might change protein effectiveness"
6998                            ],
6999                        },
7000                        {
7001                            "type": "contains",
7002                            "value": "LOW",
7003                            "score": 0,
7004                            "flag": "FILTERED",
7005                            "comment": [
7006                                "Assumed to be mostly harmless or unlikely to change protein behavior"
7007                            ],
7008                        },
7009                        {
7010                            "type": "contains",
7011                            "value": "MODIFIER",
7012                            "score": 0,
7013                            "flag": "FILTERED",
7014                            "comment": [
7015                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
7016                            ],
7017                        },
7018                    ],
7019                }
7020            },
7021        }
7022
7023        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
7025    def get_config_json(
7026        self, name: str, config_dict: dict = {}, config_file: str = None
7027    ) -> dict:
7028        """
7029        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
7030        default values, a dictionary, and a file.
7031
7032        :param name: The `name` parameter in the `get_config_json` function is a string that represents
7033        the name of the configuration. It is used to identify and retrieve the configuration settings
7034        for a specific component or module
7035        :type name: str
7036        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
7037        dictionary that allows you to provide additional configuration settings or overrides. When you
7038        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
7039        the key is the configuration setting you want to override or
7040        :type config_dict: dict
7041        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
7042        specify the path to a configuration file that contains additional settings. If provided, the
7043        function will read the contents of this file and update the configuration dictionary with the
7044        values found in the file, overriding any existing values with the
7045        :type config_file: str
7046        :return: The function `get_config_json` returns a dictionary containing the configuration
7047        settings.
7048        """
7049
7050        # Create with default prioritizations
7051        config_default = self.get_config_default(name=name)
7052        configuration = config_default
7053        # log.debug(f"configuration={configuration}")
7054
7055        # Replace prioritizations from dict
7056        for config in config_dict:
7057            configuration[config] = config_dict[config]
7058
7059        # Replace prioritizations from file
7060        config_file = full_path(config_file)
7061        if config_file:
7062            if os.path.exists(config_file):
7063                with open(config_file) as config_file_content:
7064                    config_file_dict = yaml.safe_load(config_file_content)
7065                for config in config_file_dict:
7066                    configuration[config] = config_file_dict[config]
7067            else:
7068                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
7069                log.error(msg_error)
7070                raise ValueError(msg_error)
7071
7072        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
7074    def prioritization(
7075        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7076    ) -> bool:
7077        """
7078        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7079        prioritizes variants based on configured profiles and criteria.
7080
7081        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7082        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7083        a table name is provided, the method will prioritize the variants in that specific table
7084        :type table: str
7085        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7086        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7087        provided, the code will use a default prefix value of "PZ"
7088        :type pz_prefix: str
7089        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7090        additional parameters specific to the prioritization process. These parameters can include
7091        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7092        configurations needed for the prioritization of variants in a V
7093        :type pz_param: dict
7094        :return: A boolean value (True) is being returned from the `prioritization` function.
7095        """
7096
7097        # Config
7098        config = self.get_config()
7099
7100        # Param
7101        param = self.get_param()
7102
7103        # Prioritization param
7104        if pz_param is not None:
7105            prioritization_param = pz_param
7106        else:
7107            prioritization_param = param.get("prioritization", {})
7108
7109        # Configuration profiles
7110        prioritization_config_file = prioritization_param.get(
7111            "prioritization_config", None
7112        )
7113        prioritization_config_file = full_path(prioritization_config_file)
7114        prioritizations_config = self.get_config_json(
7115            name="prioritizations", config_file=prioritization_config_file
7116        )
7117
7118        # Prioritization prefix
7119        pz_prefix_default = "PZ"
7120        if pz_prefix is None:
7121            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7122
7123        # Prioritization options
7124        profiles = prioritization_param.get("profiles", [])
7125        if isinstance(profiles, str):
7126            profiles = profiles.split(",")
7127        pzfields = prioritization_param.get(
7128            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7129        )
7130        if isinstance(pzfields, str):
7131            pzfields = pzfields.split(",")
7132        default_profile = prioritization_param.get("default_profile", None)
7133        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7134        prioritization_score_mode = prioritization_param.get(
7135            "prioritization_score_mode", "HOWARD"
7136        )
7137
7138        # Quick Prioritizations
7139        prioritizations = param.get("prioritizations", None)
7140        if prioritizations:
7141            log.info("Quick Prioritization:")
7142            for profile in prioritizations.split(","):
7143                if profile not in profiles:
7144                    profiles.append(profile)
7145                    log.info(f"   {profile}")
7146
7147        # If profile "ALL" provided, all profiles in the config profiles
7148        if "ALL" in profiles:
7149            profiles = list(prioritizations_config.keys())
7150
7151        for profile in profiles:
7152            if prioritizations_config.get(profile, None):
7153                log.debug(f"Profile '{profile}' configured")
7154            else:
7155                msg_error = f"Profile '{profile}' NOT configured"
7156                log.error(msg_error)
7157                raise ValueError(msg_error)
7158
7159        if profiles:
7160            log.info(f"Prioritization... ")
7161        else:
7162            log.debug(f"No profile defined")
7163            return False
7164
7165        if not default_profile and len(profiles):
7166            default_profile = profiles[0]
7167
7168        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7169        log.debug("Profiles to check: " + str(list(profiles)))
7170
7171        # Variables
7172        if table is not None:
7173            table_variants = table
7174        else:
7175            table_variants = self.get_table_variants(clause="update")
7176        log.debug(f"Table to prioritize: {table_variants}")
7177
7178        # Added columns
7179        added_columns = []
7180
7181        # Create list of PZfields
7182        # List of PZFields
7183        list_of_pzfields_original = pzfields + [
7184            pzfield + pzfields_sep + profile
7185            for pzfield in pzfields
7186            for profile in profiles
7187        ]
7188        list_of_pzfields = []
7189        log.debug(f"{list_of_pzfields_original}")
7190
7191        # Remove existing PZfields to use if exists
7192        for pzfield in list_of_pzfields_original:
7193            if self.get_header().infos.get(pzfield, None) is None:
7194                list_of_pzfields.append(pzfield)
7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7196            else:
7197                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7198
7199        if list_of_pzfields:
7200
7201            # Explode Infos prefix
7202            explode_infos_prefix = self.get_explode_infos_prefix()
7203
7204            # PZfields tags description
7205            PZfields_INFOS = {
7206                f"{pz_prefix}Tags": {
7207                    "ID": f"{pz_prefix}Tags",
7208                    "Number": ".",
7209                    "Type": "String",
7210                    "Description": "Variant tags based on annotation criteria",
7211                },
7212                f"{pz_prefix}Score": {
7213                    "ID": f"{pz_prefix}Score",
7214                    "Number": 1,
7215                    "Type": "Integer",
7216                    "Description": "Variant score based on annotation criteria",
7217                },
7218                f"{pz_prefix}Flag": {
7219                    "ID": f"{pz_prefix}Flag",
7220                    "Number": 1,
7221                    "Type": "String",
7222                    "Description": "Variant flag based on annotation criteria",
7223                },
7224                f"{pz_prefix}Comment": {
7225                    "ID": f"{pz_prefix}Comment",
7226                    "Number": ".",
7227                    "Type": "String",
7228                    "Description": "Variant comment based on annotation criteria",
7229                },
7230                f"{pz_prefix}Infos": {
7231                    "ID": f"{pz_prefix}Infos",
7232                    "Number": ".",
7233                    "Type": "String",
7234                    "Description": "Variant infos based on annotation criteria",
7235                },
7236                f"{pz_prefix}Class": {
7237                    "ID": f"{pz_prefix}Class",
7238                    "Number": ".",
7239                    "Type": "String",
7240                    "Description": "Variant class based on annotation criteria",
7241                },
7242            }
7243
7244            # Create INFO fields if not exist
7245            for field in PZfields_INFOS:
7246                field_ID = PZfields_INFOS[field]["ID"]
7247                field_description = PZfields_INFOS[field]["Description"]
7248                if field_ID not in self.get_header().infos and field_ID in pzfields:
7249                    field_description = (
7250                        PZfields_INFOS[field]["Description"]
7251                        + f", profile {default_profile}"
7252                    )
7253                    self.get_header().infos[field_ID] = vcf.parser._Info(
7254                        field_ID,
7255                        PZfields_INFOS[field]["Number"],
7256                        PZfields_INFOS[field]["Type"],
7257                        field_description,
7258                        "unknown",
7259                        "unknown",
7260                        code_type_map[PZfields_INFOS[field]["Type"]],
7261                    )
7262
7263            # Create INFO fields if not exist for each profile
7264            for profile in prioritizations_config:
7265                if profile in profiles or profiles == []:
7266                    for field in PZfields_INFOS:
7267                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7268                        field_description = (
7269                            PZfields_INFOS[field]["Description"]
7270                            + f", profile {profile}"
7271                        )
7272                        if (
7273                            field_ID not in self.get_header().infos
7274                            and field in pzfields
7275                        ):
7276                            self.get_header().infos[field_ID] = vcf.parser._Info(
7277                                field_ID,
7278                                PZfields_INFOS[field]["Number"],
7279                                PZfields_INFOS[field]["Type"],
7280                                field_description,
7281                                "unknown",
7282                                "unknown",
7283                                code_type_map[PZfields_INFOS[field]["Type"]],
7284                            )
7285
7286            # Header
7287            for pzfield in list_of_pzfields:
7288                if re.match(f"{pz_prefix}Score.*", pzfield):
7289                    added_column = self.add_column(
7290                        table_name=table_variants,
7291                        column_name=pzfield,
7292                        column_type="INTEGER",
7293                        default_value="0",
7294                    )
7295                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7296                    added_column = self.add_column(
7297                        table_name=table_variants,
7298                        column_name=pzfield,
7299                        column_type="BOOLEAN",
7300                        default_value="1",
7301                    )
7302                elif re.match(f"{pz_prefix}Class.*", pzfield):
7303                    added_column = self.add_column(
7304                        table_name=table_variants,
7305                        column_name=pzfield,
7306                        column_type="VARCHAR[]",
7307                        default_value="null",
7308                    )
7309                else:
7310                    added_column = self.add_column(
7311                        table_name=table_variants,
7312                        column_name=pzfield,
7313                        column_type="STRING",
7314                        default_value="''",
7315                    )
7316                added_columns.append(added_column)
7317
7318            # Profiles
7319            if profiles:
7320
7321                # foreach profile in configuration file
7322                for profile in prioritizations_config:
7323
7324                    # If profile is asked in param, or ALL are asked (empty profile [])
7325                    if profile in profiles or profiles == []:
7326                        log.info(f"Profile '{profile}'")
7327
7328                        sql_set_info_option = ""
7329
7330                        sql_set_info = []
7331
7332                        # PZ fields set
7333
7334                        # PZScore
7335                        if (
7336                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7337                            in list_of_pzfields
7338                        ):
7339                            sql_set_info.append(
7340                                f"""
7341                                    concat(
7342                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7343                                        {pz_prefix}Score{pzfields_sep}{profile}
7344                                    ) 
7345                                """
7346                            )
7347                            if (
7348                                profile == default_profile
7349                                and f"{pz_prefix}Score" in list_of_pzfields
7350                            ):
7351                                sql_set_info.append(
7352                                    f"""
7353                                        concat(
7354                                            '{pz_prefix}Score=',
7355                                            {pz_prefix}Score{pzfields_sep}{profile}
7356                                        )
7357                                    """
7358                                )
7359
7360                        # PZFlag
7361                        if (
7362                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7363                            in list_of_pzfields
7364                        ):
7365                            sql_set_info.append(
7366                                f"""
7367                                    concat(
7368                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7369                                        CASE 
7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7371                                            THEN 'PASS'
7372                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7373                                            THEN 'FILTERED'
7374                                        END
7375                                    ) 
7376                                """
7377                            )
7378                            if (
7379                                profile == default_profile
7380                                and f"{pz_prefix}Flag" in list_of_pzfields
7381                            ):
7382                                sql_set_info.append(
7383                                    f"""
7384                                        concat(
7385                                            '{pz_prefix}Flag=',
7386                                            CASE 
7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7388                                                THEN 'PASS'
7389                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7390                                                THEN 'FILTERED'
7391                                            END
7392                                        )
7393                                    """
7394                                )
7395
7396                        # PZClass
7397                        if (
7398                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7399                            in list_of_pzfields
7400                        ):
7401                            sql_set_info.append(
7402                                f"""
7403                                    concat(
7404                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7405                                        CASE
7406                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7407                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7408                                            ELSE '.'
7409                                        END 
7410                                    )
7411                                    
7412                                """
7413                            )
7414                            if (
7415                                profile == default_profile
7416                                and f"{pz_prefix}Class" in list_of_pzfields
7417                            ):
7418                                sql_set_info.append(
7419                                    f"""
7420                                        concat(
7421                                            '{pz_prefix}Class=',
7422                                            CASE
7423                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7424                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7425                                                ELSE '.'
7426                                            END 
7427                                        )
7428                                    """
7429                                )
7430
7431                        # PZComment
7432                        if (
7433                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7434                            in list_of_pzfields
7435                        ):
7436                            sql_set_info.append(
7437                                f"""
7438                                    CASE
7439                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7440                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7441                                        ELSE ''
7442                                    END
7443                                """
7444                            )
7445                            if (
7446                                profile == default_profile
7447                                and f"{pz_prefix}Comment" in list_of_pzfields
7448                            ):
7449                                sql_set_info.append(
7450                                    f"""
7451                                        CASE
7452                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7453                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7454                                            ELSE ''
7455                                        END
7456                                    """
7457                                )
7458
7459                        # PZInfos
7460                        if (
7461                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7462                            in list_of_pzfields
7463                        ):
7464                            sql_set_info.append(
7465                                f"""
7466                                    CASE
7467                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7468                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7469                                        ELSE ''
7470                                    END
7471                                """
7472                            )
7473                            if (
7474                                profile == default_profile
7475                                and f"{pz_prefix}Infos" in list_of_pzfields
7476                            ):
7477                                sql_set_info.append(
7478                                    f"""
7479                                        CASE
7480                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7481                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7482                                            ELSE ''
7483                                        END
7484                                    """
7485                                )
7486
7487                        # Merge PZfields
7488                        sql_set_info_option = ""
7489                        sql_set_sep = ""
7490                        for sql_set in sql_set_info:
7491                            if sql_set_sep:
7492                                sql_set_info_option += f"""
7493                                    , concat('{sql_set_sep}', {sql_set})
7494                                """
7495                            else:
7496                                sql_set_info_option += f"""
7497                                    , {sql_set}
7498                                """
7499                            sql_set_sep = ";"
7500
7501                        sql_queries = []
7502                        for annotation in prioritizations_config[profile]:
7503
7504                            # skip special sections
7505                            if annotation.startswith("_"):
7506                                continue
7507
7508                            # For each criterions
7509                            for criterion in prioritizations_config[profile][
7510                                annotation
7511                            ]:
7512
7513                                # Criterion mode
7514                                criterion_mode = None
7515                                if np.any(
7516                                    np.isin(list(criterion.keys()), ["type", "value"])
7517                                ):
7518                                    criterion_mode = "operation"
7519                                elif np.any(
7520                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7521                                ):
7522                                    criterion_mode = "sql"
7523                                log.debug(f"Criterion Mode: {criterion_mode}")
7524
7525                                # Criterion parameters
7526                                criterion_type = criterion.get("type", None)
7527                                criterion_value = criterion.get("value", None)
7528                                criterion_sql = criterion.get("sql", None)
7529                                criterion_fields = criterion.get("fields", None)
7530                                criterion_score = criterion.get("score", 0)
7531                                criterion_flag = criterion.get("flag", "PASS")
7532                                criterion_class = criterion.get("class", None)
7533                                criterion_flag_bool = criterion_flag == "PASS"
7534                                criterion_comment = (
7535                                    ", ".join(criterion.get("comment", []))
7536                                    .replace("'", "''")
7537                                    .replace(";", ",")
7538                                    .replace("\t", " ")
7539                                )
7540                                criterion_infos = (
7541                                    str(criterion)
7542                                    .replace("'", "''")
7543                                    .replace(";", ",")
7544                                    .replace("\t", " ")
7545                                )
7546
7547                                # SQL
7548                                if criterion_sql is not None and isinstance(
7549                                    criterion_sql, list
7550                                ):
7551                                    criterion_sql = " ".join(criterion_sql)
7552
7553                                # Fields and explode
7554                                if criterion_fields is None:
7555                                    criterion_fields = [annotation]
7556                                if not isinstance(criterion_fields, list):
7557                                    criterion_fields = str(criterion_fields).split(",")
7558
7559                                # Class
7560                                if criterion_class is not None and not isinstance(
7561                                    criterion_class, list
7562                                ):
7563                                    criterion_class = str(criterion_class).split(",")
7564
7565                                for annotation_field in criterion_fields:
7566
7567                                    # Explode specific annotation
7568                                    log.debug(
7569                                        f"Explode annotation '{annotation_field}'"
7570                                    )
7571                                    added_columns += self.explode_infos(
7572                                        prefix=explode_infos_prefix,
7573                                        fields=[annotation_field],
7574                                        table=table_variants,
7575                                    )
7576                                    extra_infos = self.get_extra_infos(
7577                                        table=table_variants
7578                                    )
7579
7580                                    # Check if annotation field is present
7581                                    if (
7582                                        f"{explode_infos_prefix}{annotation_field}"
7583                                        not in extra_infos
7584                                    ):
7585                                        msq_err = f"Annotation '{annotation_field}' not in data"
7586                                        log.error(msq_err)
7587                                        raise ValueError(msq_err)
7588                                    else:
7589                                        log.debug(
7590                                            f"Annotation '{annotation_field}' in data"
7591                                        )
7592
7593                                sql_set = []
7594                                sql_set_info = []
7595
7596                                # PZ fields set
7597
7598                                # PZScore
7599                                if (
7600                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7601                                    in list_of_pzfields
7602                                ):
7603                                    # VaRank prioritization score mode
7604                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
7605                                        sql_set.append(
7606                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
7607                                        )
7608                                    # default HOWARD prioritization score mode
7609                                    else:
7610                                        sql_set.append(
7611                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7612                                        )
7613
7614                                # PZFlag
7615                                if (
7616                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7617                                    in list_of_pzfields
7618                                ):
7619                                    sql_set.append(
7620                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7621                                    )
7622
7623                                # PZClass
7624                                if (
7625                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7626                                    in list_of_pzfields
7627                                    and criterion_class is not None
7628                                ):
7629                                    sql_set.append(
7630                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7631                                    )
7632
7633                                # PZComment
7634                                if (
7635                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7636                                    in list_of_pzfields
7637                                ):
7638                                    sql_set.append(
7639                                        f"""
7640                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7641                                                concat(
7642                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7643                                                    CASE 
7644                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7645                                                        THEN ', '
7646                                                        ELSE ''
7647                                                    END,
7648                                                    '{criterion_comment}'
7649                                                )
7650                                        """
7651                                    )
7652
7653                                # PZInfos
7654                                if (
7655                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7656                                    in list_of_pzfields
7657                                ):
7658                                    sql_set.append(
7659                                        f"""
7660                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7661                                                concat(
7662                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7663                                                    '{criterion_infos}'
7664                                                )
7665                                        """
7666                                    )
7667                                sql_set_option = ",".join(sql_set)
7668
7669                                # Criterion and comparison
7670                                if sql_set_option:
7671
7672                                    if criterion_mode in ["operation"]:
7673
7674                                        try:
7675                                            float(criterion_value)
7676                                            sql_update = f"""
7677                                                UPDATE {table_variants}
7678                                                SET {sql_set_option}
7679                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7680                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7681                                            """
7682                                        except:
7683                                            contains_option = ""
7684                                            if criterion_type == "contains":
7685                                                contains_option = ".*"
7686                                            sql_update = f"""
7687                                                UPDATE {table_variants}
7688                                                SET {sql_set_option}
7689                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7690                                            """
7691                                        sql_queries.append(sql_update)
7692
7693                                    elif criterion_mode in ["sql"]:
7694
7695                                        sql_update = f"""
7696                                            UPDATE {table_variants}
7697                                            SET {sql_set_option}
7698                                            WHERE {criterion_sql}
7699                                        """
7700                                        sql_queries.append(sql_update)
7701
7702                                    else:
7703                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7704                                        log.error(msg_err)
7705                                        raise ValueError(msg_err)
7706
7707                                else:
7708                                    log.warning(
7709                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7710                                    )
7711
7712                        # PZTags
7713                        if (
7714                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7715                            in list_of_pzfields
7716                        ):
7717
7718                            # Create PZFalgs value
7719                            pztags_value = ""
7720                            pztags_sep_default = ","
7721                            pztags_sep = ""
7722                            for pzfield in pzfields:
7723                                if pzfield not in [f"{pz_prefix}Tags"]:
7724                                    if (
7725                                        f"{pzfield}{pzfields_sep}{profile}"
7726                                        in list_of_pzfields
7727                                    ):
7728                                        if pzfield in [f"{pz_prefix}Flag"]:
7729                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7730                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7731                                                    THEN 'PASS'
7732                                                    ELSE 'FILTERED'
7733                                                END, '"""
7734                                        elif pzfield in [f"{pz_prefix}Class"]:
7735                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7736                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7737                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7738                                                    ELSE '.'
7739                                                END, '"""
7740                                        else:
7741                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7742                                        pztags_sep = pztags_sep_default
7743
7744                            # Add Query update for PZFlags
7745                            sql_update_pztags = f"""
7746                                UPDATE {table_variants}
7747                                SET INFO = concat(
7748                                        INFO,
7749                                        CASE WHEN INFO NOT in ('','.')
7750                                                THEN ';'
7751                                                ELSE ''
7752                                        END,
7753                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7754                                    )
7755                                """
7756                            sql_queries.append(sql_update_pztags)
7757
7758                            # Add Query update for PZFlags for default
7759                            if profile == default_profile:
7760                                sql_update_pztags_default = f"""
7761                                UPDATE {table_variants}
7762                                SET INFO = concat(
7763                                        INFO,
7764                                        ';',
7765                                        '{pz_prefix}Tags={pztags_value}'
7766                                    )
7767                                """
7768                                sql_queries.append(sql_update_pztags_default)
7769
7770                        log.info(f"""Profile '{profile}' - Prioritization... """)
7771
7772                        if sql_queries:
7773
7774                            for sql_query in sql_queries:
7775                                log.debug(
7776                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7777                                )
7778                                self.conn.execute(sql_query)
7779
7780                        log.info(f"""Profile '{profile}' - Update... """)
7781                        sql_query_update = f"""
7782                            UPDATE {table_variants}
7783                            SET INFO =  
7784                                concat(
7785                                    CASE
7786                                        WHEN INFO NOT IN ('','.')
7787                                        THEN concat(INFO, ';')
7788                                        ELSE ''
7789                                    END
7790                                    {sql_set_info_option}
7791                                )
7792                        """
7793                        self.conn.execute(sql_query_update)
7794
7795        else:
7796
7797            log.warning(f"No profiles in parameters")
7798
7799        # Remove added columns
7800        for added_column in added_columns:
7801            self.drop_column(column=added_column)
7802
7803        # Explode INFOS fields into table fields
7804        if self.get_explode_infos():
7805            self.explode_infos(
7806                prefix=self.get_explode_infos_prefix(),
7807                fields=self.get_explode_infos_fields(),
7808                force=True,
7809            )
7810
7811        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

A boolean value (True) is being returned from the prioritization function.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Side effects: writes the computed HGVS names into the `INFO` column of the
        variants table (as an `hgvs=` field), registers the `hgvs` INFO field in the
        VCF header, and drops the temporary working column when done. Returns early
        (without annotating) when no "hgvs" section is present in the parameters.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a comma-separated list of HGVS names associated with the given genomic
            coordinates and alleles.

            NOTE: this closure reads `polars_conn`, `refseq_df`, `refseqlink_df`,
            `transcripts`, `genome` and the HGVS option flags from the enclosing
            scope; those names are bound later in `annotation_hgvs`, before this
            function is executed via `map_partitions`.

            :param row: A dictionary-like object that contains the values for the following keys:
            CHROM, POS, REF, ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            # NOTE(review): `chr` shadows the Python builtin of the same name
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): `refseqlink_df` is only bound in the enclosing scope
                # when a refSeqLink file was found; if use_protein/add_protein/
                # full_format is set without one, this query would raise NameError —
                # confirm configuration guarantees a refSeqLink database in that case
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name alongside the first
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection
        # NOTE(review): a second SQLContext is created further below after the
        # dataframes exist; presumably this first one is redundant — TODO confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" (comma-separated var=val pairs) into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means "enable"
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then lookup by assembly in the genomes folder
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT must be plain letter sequences)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add hgvs working column in variants table (randomized name to avoid clashes)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq transcripts overlapping the variants into a Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink (transcript -> protein accession) in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        # presumably re-created here so the SQLContext registers the dataframes
        # (refseq_df / refseqlink_df) created above — TODO confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe,
        # with one partition per thread
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file (matched on CHROM/POS/REF/ALT)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append "hgvs=<names>" (semicolon-separated when INFO not empty)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add the "hgvs" INFO field to the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (drop the temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8204    def get_operations_help(
8205        self, operations_config_dict: dict = {}, operations_config_file: str = None
8206    ) -> list:
8207
8208        # Init
8209        operations_help = []
8210
8211        # operations
8212        operations = self.get_config_json(
8213            name="calculations",
8214            config_dict=operations_config_dict,
8215            config_file=operations_config_file,
8216        )
8217        for op in operations:
8218            op_name = operations[op].get("name", op).upper()
8219            op_description = operations[op].get("description", op_name)
8220            op_available = operations[op].get("available", False)
8221            if op_available:
8222                operations_help.append(f"   {op_name}: {op_description}")
8223
8224        # Sort operations
8225        operations_help.sort()
8226
8227        # insert header
8228        operations_help.insert(0, "Available calculation operations:")
8229
8230        # Return
8231        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
8233    def calculation(
8234        self,
8235        operations: dict = {},
8236        operations_config_dict: dict = {},
8237        operations_config_file: str = None,
8238    ) -> None:
8239        """
8240        It takes a list of operations, and for each operation, it checks if it's a python or sql
8241        operation, and then calls the appropriate function
8242
8243        param json example:
8244            "calculation": {
8245                "NOMEN": {
8246                    "options": {
8247                        "hgvs_field": "hgvs"
8248                    },
8249                "middle" : null
8250            }
8251        """
8252
8253        # Param
8254        param = self.get_param()
8255
8256        # CHeck operations config file
8257        if operations_config_file is None:
8258            operations_config_file = param.get("calculation", {}).get(
8259                "calculation_config", None
8260            )
8261
8262        # operations config
8263        operations_config = self.get_config_json(
8264            name="calculations",
8265            config_dict=operations_config_dict,
8266            config_file=operations_config_file,
8267        )
8268
8269        # Upper keys
8270        operations_config = {k.upper(): v for k, v in operations_config.items()}
8271
8272        # Calculations
8273
8274        # Operations from param
8275        operations = param.get("calculation", {}).get("calculations", operations)
8276
8277        # Quick calculation - add
8278        if param.get("calculations", None):
8279
8280            # List of operations
8281            calculations_list = [
8282                value.strip() for value in param.get("calculations", "").split(",")
8283            ]
8284
8285            # Log
8286            log.info(f"Quick Calculations:")
8287            for calculation_key in calculations_list:
8288                log.info(f"   {calculation_key}")
8289
8290            # Create tmp operations (to keep operation order)
8291            operations_tmp = {}
8292            for calculation_operation in calculations_list:
8293                if calculation_operation.upper() not in operations_tmp:
8294                    log.debug(
8295                        f"{calculation_operation}.upper() not in {operations_tmp}"
8296                    )
8297                    operations_tmp[calculation_operation.upper()] = {}
8298                    add_value_into_dict(
8299                        dict_tree=operations_tmp,
8300                        sections=[
8301                            calculation_operation.upper(),
8302                        ],
8303                        value=operations.get(calculation_operation.upper(), {}),
8304                    )
8305            # Add operations already in param
8306            for calculation_operation in operations:
8307                if calculation_operation not in operations_tmp:
8308                    operations_tmp[calculation_operation] = operations.get(
8309                        calculation_operation, {}
8310                    )
8311
8312            # Update operations in param
8313            operations = operations_tmp
8314
8315        # Operations for calculation
8316        if not operations:
8317            operations = param.get("calculation", {}).get("calculations", {})
8318
8319        if operations:
8320            log.info(f"Calculations...")
8321
8322        # For each operations
8323        for operation_name in operations:
8324            operation_name = operation_name.upper()
8325            if operation_name not in [""]:
8326                if operation_name in operations_config:
8327                    log.info(f"Calculation '{operation_name}'")
8328                    operation = operations_config[operation_name]
8329                    operation_type = operation.get("type", "sql")
8330                    if operation_type == "python":
8331                        self.calculation_process_function(
8332                            operation=operation, operation_name=operation_name
8333                        )
8334                    elif operation_type == "sql":
8335                        self.calculation_process_sql(
8336                            operation=operation, operation_name=operation_name
8337                        )
8338                    else:
8339                        log.error(
8340                            f"Operations config: Type '{operation_type}' NOT available"
8341                        )
8342                        raise ValueError(
8343                            f"Operations config: Type '{operation_type}' NOT available"
8344                        )
8345                else:
8346                    log.error(
8347                        f"Operations config: Calculation '{operation_name}' NOT available"
8348                    )
8349                    raise ValueError(
8350                        f"Operations config: Calculation '{operation_name}' NOT available"
8351                    )
8352
8353        # Explode INFOS fields into table fields
8354        if self.get_explode_infos():
8355            self.explode_infos(
8356                prefix=self.get_explode_infos_prefix(),
8357                fields=self.get_explode_infos_fields(),
8358                force=True,
8359            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8361    def calculation_process_sql(
8362        self, operation: dict, operation_name: str = "unknown"
8363    ) -> None:
8364        """
8365        The `calculation_process_sql` function takes in a mathematical operation as a string and
8366        performs the operation, updating the specified table with the result.
8367
8368        :param operation: The `operation` parameter is a dictionary that contains information about the
8369        mathematical operation to be performed. It includes the following keys:
8370        :type operation: dict
8371        :param operation_name: The `operation_name` parameter is a string that represents the name of
8372        the mathematical operation being performed. It is used for logging and error handling purposes,
8373        defaults to unknown
8374        :type operation_name: str (optional)
8375        """
8376
8377        # Operation infos
8378        operation_name = operation.get("name", "unknown")
8379        log.debug(f"process SQL {operation_name}")
8380        output_column_name = operation.get("output_column_name", operation_name)
8381        output_column_type = operation.get("output_column_type", "String")
8382        prefix = operation.get("explode_infos_prefix", "")
8383        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8384        output_column_description = operation.get(
8385            "output_column_description", f"{operation_name} operation"
8386        )
8387        operation_query = operation.get("operation_query", None)
8388        if isinstance(operation_query, list):
8389            operation_query = " ".join(operation_query)
8390        operation_info_fields = operation.get("info_fields", [])
8391        operation_info_fields_check = operation.get("info_fields_check", False)
8392        operation_info = operation.get("operation_info", True)
8393        operation_table = operation.get(
8394            "table", self.get_table_variants(clause="alter")
8395        )
8396
8397        # table variants
8398        if operation_table:
8399            table_variants = operation_table
8400        else:
8401            table_variants = self.get_table_variants(clause="alter")
8402
8403        if operation_query:
8404
8405            # Info fields check
8406            operation_info_fields_check_result = True
8407            if operation_info_fields_check:
8408                header_infos = self.get_header().infos
8409                for info_field in operation_info_fields:
8410                    operation_info_fields_check_result = (
8411                        operation_info_fields_check_result
8412                        and info_field in header_infos
8413                    )
8414
8415            # If info fields available
8416            if operation_info_fields_check_result:
8417
8418                # Added_columns
8419                added_columns = []
8420
8421                # Create VCF header field
8422                vcf_reader = self.get_header()
8423                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8424                    output_column_name,
8425                    ".",
8426                    output_column_type,
8427                    output_column_description,
8428                    "howard calculation",
8429                    "0",
8430                    self.code_type_map.get(output_column_type),
8431                )
8432
8433                # Explode infos if needed
8434                log.debug(f"calculation_process_sql prefix {prefix}")
8435                added_columns += self.explode_infos(
8436                    prefix=prefix,
8437                    fields=[output_column_name] + operation_info_fields,
8438                    force=False,
8439                    table=table_variants,
8440                )
8441
8442                # Create column
8443                added_column = self.add_column(
8444                    table_name=table_variants,
8445                    column_name=prefix + output_column_name,
8446                    column_type=output_column_type_sql,
8447                    default_value="null",
8448                )
8449                added_columns.append(added_column)
8450
8451                # Operation calculation
8452                try:
8453
8454                    # Query to update calculation column
8455                    sql_update = f"""
8456                        UPDATE {table_variants}
8457                        SET "{prefix}{output_column_name}" = ({operation_query})
8458                    """
8459                    self.conn.execute(sql_update)
8460
8461                    # Add to INFO
8462                    if operation_info:
8463                        sql_update_info = f"""
8464                            UPDATE {table_variants}
8465                            SET "INFO" =
8466                                concat(
8467                                    CASE
8468                                        WHEN "INFO" IS NOT NULL
8469                                        THEN concat("INFO", ';')
8470                                        ELSE ''
8471                                    END,
8472                                    '{output_column_name}=',
8473                                    "{prefix}{output_column_name}"
8474                                )
8475                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8476                        """
8477                        self.conn.execute(sql_update_info)
8478
8479                except:
8480                    log.error(
8481                        f"Operations config: Calculation '{operation_name}' query failed"
8482                    )
8483                    raise ValueError(
8484                        f"Operations config: Calculation '{operation_name}' query failed"
8485                    )
8486
8487                # Remove added columns
8488                for added_column in added_columns:
8489                    log.debug(f"added_column: {added_column}")
8490                    self.drop_column(column=added_column)
8491
8492            else:
8493                log.error(
8494                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8495                )
8496                raise ValueError(
8497                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8498                )
8499
8500        else:
8501            log.error(
8502                f"Operations config: Calculation '{operation_name}' query NOT defined"
8503            )
8504            raise ValueError(
8505                f"Operations config: Calculation '{operation_name}' query NOT defined"
8506            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8508    def calculation_process_function(
8509        self, operation: dict, operation_name: str = "unknown"
8510    ) -> None:
8511        """
8512        The `calculation_process_function` takes in an operation dictionary and performs the specified
8513        function with the given parameters.
8514
8515        :param operation: The `operation` parameter is a dictionary that contains information about the
8516        operation to be performed. It has the following keys:
8517        :type operation: dict
8518        :param operation_name: The `operation_name` parameter is a string that represents the name of
8519        the operation being performed. It is used for logging purposes, defaults to unknown
8520        :type operation_name: str (optional)
8521        """
8522
8523        operation_name = operation["name"]
8524        log.debug(f"process Python {operation_name}")
8525        function_name = operation["function_name"]
8526        function_params = operation["function_params"]
8527        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8529    def calculation_variant_id(self) -> None:
8530        """
8531        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8532        updates the INFO field of a variants table with the variant ID.
8533        """
8534
8535        # variant_id annotation field
8536        variant_id_tag = self.get_variant_id_column()
8537        added_columns = [variant_id_tag]
8538
8539        # variant_id hgvs tags"
8540        vcf_infos_tags = {
8541            variant_id_tag: "howard variant ID annotation",
8542        }
8543
8544        # Variants table
8545        table_variants = self.get_table_variants()
8546
8547        # Header
8548        vcf_reader = self.get_header()
8549
8550        # Add variant_id to header
8551        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8552            variant_id_tag,
8553            ".",
8554            "String",
8555            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8556            "howard calculation",
8557            "0",
8558            self.code_type_map.get("String"),
8559        )
8560
8561        # Update
8562        sql_update = f"""
8563            UPDATE {table_variants}
8564            SET "INFO" = 
8565                concat(
8566                    CASE
8567                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8568                        THEN ''
8569                        ELSE concat("INFO", ';')
8570                    END,
8571                    '{variant_id_tag}=',
8572                    "{variant_id_tag}"
8573                )
8574        """
8575        self.conn.execute(sql_update)
8576
8577        # Remove added columns
8578        for added_column in added_columns:
8579            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8581    def calculation_extract_snpeff_hgvs(
8582        self,
8583        snpeff_hgvs: str = "snpeff_hgvs",
8584        snpeff_field: str = "ANN",
8585    ) -> None:
8586        """
8587        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8588        annotation field in a VCF file and adds them as a new column in the variants table.
8589
8590        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8591        function is used to specify the name of the column that will store the HGVS nomenclatures
8592        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8593        snpeff_hgvs
8594        :type snpeff_hgvs: str (optional)
8595        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8596        function represents the field in the VCF file that contains SnpEff annotations. This field is
8597        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8598        to ANN
8599        :type snpeff_field: str (optional)
8600        """
8601
8602        # Snpeff hgvs tags
8603        vcf_infos_tags = {
8604            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8605        }
8606
8607        # Prefix
8608        prefix = self.get_explode_infos_prefix()
8609        if prefix:
8610            prefix = "INFO/"
8611
8612        # snpEff fields
8613        speff_ann_infos = prefix + snpeff_field
8614        speff_hgvs_infos = prefix + snpeff_hgvs
8615
8616        # Variants table
8617        table_variants = self.get_table_variants()
8618
8619        # Header
8620        vcf_reader = self.get_header()
8621
8622        # Add columns
8623        added_columns = []
8624
8625        # Explode HGVS field in column
8626        added_columns += self.explode_infos(fields=[snpeff_field])
8627
8628        if snpeff_field in vcf_reader.infos:
8629
8630            log.debug(vcf_reader.infos[snpeff_field])
8631
8632            # Extract ANN header
8633            ann_description = vcf_reader.infos[snpeff_field].desc
8634            pattern = r"'(.+?)'"
8635            match = re.search(pattern, ann_description)
8636            if match:
8637                ann_header_match = match.group(1).split(" | ")
8638                ann_header_desc = {}
8639                for i in range(len(ann_header_match)):
8640                    ann_header_info = "".join(
8641                        char for char in ann_header_match[i] if char.isalnum()
8642                    )
8643                    ann_header_desc[ann_header_info] = ann_header_match[i]
8644                if not ann_header_desc:
8645                    raise ValueError("Invalid header description format")
8646            else:
8647                raise ValueError("Invalid header description format")
8648
8649            # Create variant id
8650            variant_id_column = self.get_variant_id_column()
8651            added_columns += [variant_id_column]
8652
8653            # Create dataframe
8654            dataframe_snpeff_hgvs = self.get_query_to_df(
8655                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8656            )
8657
8658            # Create main NOMEN column
8659            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8660                speff_ann_infos
8661            ].apply(
8662                lambda x: extract_snpeff_hgvs(
8663                    str(x), header=list(ann_header_desc.values())
8664                )
8665            )
8666
8667            # Add snpeff_hgvs to header
8668            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8669                snpeff_hgvs,
8670                ".",
8671                "String",
8672                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8673                "howard calculation",
8674                "0",
8675                self.code_type_map.get("String"),
8676            )
8677
8678            # Update
8679            sql_update = f"""
8680                UPDATE variants
8681                SET "INFO" = 
8682                    concat(
8683                        CASE
8684                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8685                            THEN ''
8686                            ELSE concat("INFO", ';')
8687                        END,
8688                        CASE 
8689                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8690                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8691                            THEN concat(
8692                                    '{snpeff_hgvs}=',
8693                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8694                                )
8695                            ELSE ''
8696                        END
8697                    )
8698                FROM dataframe_snpeff_hgvs
8699                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8700
8701            """
8702            self.conn.execute(sql_update)
8703
8704            # Delete dataframe
8705            del dataframe_snpeff_hgvs
8706            gc.collect()
8707
8708        else:
8709
8710            log.warning(
8711                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8712            )
8713
8714        # Remove added columns
8715        for added_column in added_columns:
8716            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. This parameter allows you to rename the output column; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8718    def calculation_snpeff_ann_explode(
8719        self,
8720        uniquify: bool = True,
8721        output_format: str = "fields",
8722        output_prefix: str = "snpeff_",
8723        snpeff_field: str = "ANN",
8724    ) -> None:
8725        """
8726        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8727        exploding the HGVS field and updating variant information accordingly.
8728
8729        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8730        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8731        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8732        defaults to True
8733        :type uniquify: bool (optional)
8734        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8735        function specifies the format in which the output annotations will be generated. It has a
8736        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8737        format, defaults to fields
8738        :type output_format: str (optional)
8739        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8740        method is used to specify the prefix that will be added to the output annotations generated
8741        during the calculation process. This prefix helps to differentiate the newly added annotations
8742        from existing ones in the output data. By default, the, defaults to ANN_
8743        :type output_prefix: str (optional)
8744        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8745        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8746        field will be processed to explode the HGVS annotations and update the variant information
8747        accordingly, defaults to ANN
8748        :type snpeff_field: str (optional)
8749        """
8750
8751        # SnpEff annotation field
8752        snpeff_hgvs = "snpeff_ann_explode"
8753
8754        # Snpeff hgvs tags
8755        vcf_infos_tags = {
8756            snpeff_hgvs: "Explode snpEff annotations",
8757        }
8758
8759        # Prefix
8760        prefix = self.get_explode_infos_prefix()
8761        if prefix:
8762            prefix = "INFO/"
8763
8764        # snpEff fields
8765        speff_ann_infos = prefix + snpeff_field
8766        speff_hgvs_infos = prefix + snpeff_hgvs
8767
8768        # Variants table
8769        table_variants = self.get_table_variants()
8770
8771        # Header
8772        vcf_reader = self.get_header()
8773
8774        # Add columns
8775        added_columns = []
8776
8777        # Explode HGVS field in column
8778        added_columns += self.explode_infos(fields=[snpeff_field])
8779        log.debug(f"snpeff_field={snpeff_field}")
8780        log.debug(f"added_columns={added_columns}")
8781
8782        if snpeff_field in vcf_reader.infos:
8783
8784            # Extract ANN header
8785            ann_description = vcf_reader.infos[snpeff_field].desc
8786            pattern = r"'(.+?)'"
8787            match = re.search(pattern, ann_description)
8788            if match:
8789                ann_header_match = match.group(1).split(" | ")
8790                ann_header = []
8791                ann_header_desc = {}
8792                for i in range(len(ann_header_match)):
8793                    ann_header_info = "".join(
8794                        char for char in ann_header_match[i] if char.isalnum()
8795                    )
8796                    ann_header.append(ann_header_info)
8797                    ann_header_desc[ann_header_info] = ann_header_match[i]
8798                if not ann_header_desc:
8799                    raise ValueError("Invalid header description format")
8800            else:
8801                raise ValueError("Invalid header description format")
8802
8803            # Create variant id
8804            variant_id_column = self.get_variant_id_column()
8805            added_columns += [variant_id_column]
8806
8807            # Create dataframe
8808            dataframe_snpeff_hgvs = self.get_query_to_df(
8809                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8810            )
8811
8812            # Create snpEff columns
8813            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8814                speff_ann_infos
8815            ].apply(
8816                lambda x: explode_snpeff_ann(
8817                    str(x),
8818                    uniquify=uniquify,
8819                    output_format=output_format,
8820                    prefix=output_prefix,
8821                    header=list(ann_header_desc.values()),
8822                )
8823            )
8824
8825            # Header
8826            ann_annotations_prefix = ""
8827            if output_format.upper() in ["JSON"]:
8828                ann_annotations_prefix = f"{output_prefix}="
8829                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8830                    output_prefix,
8831                    ".",
8832                    "String",
8833                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8834                    + " - JSON format",
8835                    "howard calculation",
8836                    "0",
8837                    self.code_type_map.get("String"),
8838                )
8839            else:
8840                for ann_annotation in ann_header:
8841                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8842                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8843                        ann_annotation_id,
8844                        ".",
8845                        "String",
8846                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8847                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8848                        "howard calculation",
8849                        "0",
8850                        self.code_type_map.get("String"),
8851                    )
8852
8853            # Update
8854            sql_update = f"""
8855                UPDATE variants
8856                SET "INFO" = 
8857                    concat(
8858                        CASE
8859                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8860                            THEN ''
8861                            ELSE concat("INFO", ';')
8862                        END,
8863                        CASE 
8864                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8865                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8866                            THEN concat(
8867                                '{ann_annotations_prefix}',
8868                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8869                                )
8870                            ELSE ''
8871                        END
8872                    )
8873                FROM dataframe_snpeff_hgvs
8874                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8875
8876            """
8877            self.conn.execute(sql_update)
8878
8879            # Delete dataframe
8880            del dataframe_snpeff_hgvs
8881            gc.collect()
8882
8883        else:
8884
8885            log.warning(
8886                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8887            )
8888
8889        # Remove added columns
8890        for added_column in added_columns:
8891            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature (NOMEN and related fields) for each variant
        and append them to the INFO field of the variants table.

        Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:
        "hgvs_field" (INFO field holding HGVS strings, default "hgvs"),
        "pattern" (NOMEN pattern), "transcripts" (preferred-transcripts file),
        "transcripts_table" / "transcripts_column" (per-variant preferred transcript),
        and "transcripts_order" (preference between "column" and "file" sources).

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the per-variant dict of
        # NOMEN sub-fields returned by find_nomen()
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        # NOTE(review): `threads` is assigned but never used in this method
        threads = self.get_threads()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added temporarily, removed at the end
        added_columns = []

        # Get HGVS field (INFO field containing the HGVS strings to parse)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts (optional file listing preferred transcripts, first column)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # Qualified column reference used directly in the SELECT below
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No per-variant transcript column configured: SQL literal NULL
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE(review): table name `variants` is hard-coded here and in the
            # UPDATE below, unlike other calculations that use get_table_variants()
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: preferred transcript -> 1-based preference rank
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column (dict of NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update; each fragment carries its own
                # leading ';' so absent sub-fields contribute nothing
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE(review): when INFO is NULL/empty, the first appended fragment
            # still starts with ';', leaving INFO with a leading semicolon —
            # other calculations avoid this; confirm whether intended
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute, for each variant, the number of pipelines/samples in which the
        variant was found, and append the result to the INFO column of the
        variants table.

        Only runs when the VCF has genotypes (a FORMAT column and at least one
        sample); otherwise the method is a no-op.

        :param tag: INFO tag name used both for the VCF header declaration and
            for the key appended to the INFO field, defaults to findbypipeline
        :type tag: str (optional)
        """

        # Genotype data required: FORMAT column plus at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field (INFO tag to add)
            findbypipeline_tag = tag

            # Header descriptions for the VCF INFO tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the computed column in the working dataframe
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (mutated in place below)
            vcf_reader = self.get_header()

            # Variant id column (added to the table; dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Fetch the genotype columns into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row over the genotypes
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the findbypipeline tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id.
            # NOTE: duckdb resolves `dataframe_findbypipeline` in the FROM
            # clause by scanning local Python variables (replacement scan),
            # so the variable name above must match the SQL identifier.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the helper columns added above
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_findbypipeline
            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9203    def calculation_genotype_concordance(self) -> None:
9204        """
9205        The function `calculation_genotype_concordance` calculates the genotype concordance for
9206        multi-caller VCF files and updates the variant information in the database.
9207        """
9208
9209        # if FORMAT and samples
9210        if (
9211            "FORMAT" in self.get_header_columns_as_list()
9212            and self.get_header_sample_list()
9213        ):
9214
9215            # genotypeconcordance annotation field
9216            genotypeconcordance_tag = "genotypeconcordance"
9217
9218            # VCF infos tags
9219            vcf_infos_tags = {
9220                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9221            }
9222
9223            # Prefix
9224            prefix = self.get_explode_infos_prefix()
9225
9226            # Field
9227            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9228
9229            # Variants table
9230            table_variants = self.get_table_variants()
9231
9232            # Header
9233            vcf_reader = self.get_header()
9234
9235            # Create variant id
9236            variant_id_column = self.get_variant_id_column()
9237            added_columns = [variant_id_column]
9238
9239            # variant_id, FORMAT and samples
9240            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9241                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9242            )
9243
9244            # Create dataframe
9245            dataframe_genotypeconcordance = self.get_query_to_df(
9246                f""" SELECT {samples_fields} FROM {table_variants} """
9247            )
9248
9249            # Create genotypeconcordance column
9250            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9251                dataframe_genotypeconcordance.apply(
9252                    lambda row: genotypeconcordance(
9253                        row, samples=self.get_header_sample_list()
9254                    ),
9255                    axis=1,
9256                )
9257            )
9258
9259            # Add genotypeconcordance to header
9260            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9261                genotypeconcordance_tag,
9262                ".",
9263                "String",
9264                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9265                "howard calculation",
9266                "0",
9267                self.code_type_map.get("String"),
9268            )
9269
9270            # Update
9271            sql_update = f"""
9272                UPDATE variants
9273                SET "INFO" = 
9274                    concat(
9275                        CASE
9276                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9277                            THEN ''
9278                            ELSE concat("INFO", ';')
9279                        END,
9280                        CASE
9281                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9282                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9283                            THEN concat(
9284                                    '{genotypeconcordance_tag}=',
9285                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9286                                )
9287                            ELSE ''
9288                        END
9289                    )
9290                FROM dataframe_genotypeconcordance
9291                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9292            """
9293            self.conn.execute(sql_update)
9294
9295            # Remove added columns
9296            for added_column in added_columns:
9297                self.drop_column(column=added_column)
9298
9299            # Delete dataframe
9300            del dataframe_genotypeconcordance
9301            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9303    def calculation_barcode(self, tag: str = "barcode") -> None:
9304        """
9305        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9306        updates the INFO field in the file with the calculated barcode values.
9307
9308        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9309        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9310        the default tag name is set to "barcode", defaults to barcode
9311        :type tag: str (optional)
9312        """
9313
9314        # if FORMAT and samples
9315        if (
9316            "FORMAT" in self.get_header_columns_as_list()
9317            and self.get_header_sample_list()
9318        ):
9319
9320            # barcode annotation field
9321            if not tag:
9322                tag = "barcode"
9323
9324            # VCF infos tags
9325            vcf_infos_tags = {
9326                tag: "barcode calculation (VaRank)",
9327            }
9328
9329            # Prefix
9330            prefix = self.get_explode_infos_prefix()
9331
9332            # Field
9333            barcode_infos = prefix + tag
9334
9335            # Variants table
9336            table_variants = self.get_table_variants()
9337
9338            # Header
9339            vcf_reader = self.get_header()
9340
9341            # Create variant id
9342            variant_id_column = self.get_variant_id_column()
9343            added_columns = [variant_id_column]
9344
9345            # variant_id, FORMAT and samples
9346            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9347                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9348            )
9349
9350            # Create dataframe
9351            dataframe_barcode = self.get_query_to_df(
9352                f""" SELECT {samples_fields} FROM {table_variants} """
9353            )
9354
9355            # Create barcode column
9356            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9357                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9358            )
9359
9360            # Add barcode to header
9361            vcf_reader.infos[tag] = vcf.parser._Info(
9362                tag,
9363                ".",
9364                "String",
9365                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9366                "howard calculation",
9367                "0",
9368                self.code_type_map.get("String"),
9369            )
9370
9371            # Update
9372            sql_update = f"""
9373                UPDATE {table_variants}
9374                SET "INFO" = 
9375                    concat(
9376                        CASE
9377                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9378                            THEN ''
9379                            ELSE concat("INFO", ';')
9380                        END,
9381                        CASE
9382                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9383                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9384                            THEN concat(
9385                                    '{tag}=',
9386                                    dataframe_barcode."{barcode_infos}"
9387                                )
9388                            ELSE ''
9389                        END
9390                    )
9391                FROM dataframe_barcode
9392                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9393            """
9394            self.conn.execute(sql_update)
9395
9396            # Remove added columns
9397            for added_column in added_columns:
9398                self.drop_column(column=added_column)
9399
9400            # Delete dataframe
9401            del dataframe_barcode
9402            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
9404    def calculation_barcode_family(self, tag: str = "BCF") -> None:
9405        """
9406        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
9407        and updates the INFO field in the file with the calculated barcode values.
9408
9409        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
9410        the barcode tag that will be added to the VCF file during the calculation process. If no value
9411        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
9412        :type tag: str (optional)
9413        """
9414
9415        # if FORMAT and samples
9416        if (
9417            "FORMAT" in self.get_header_columns_as_list()
9418            and self.get_header_sample_list()
9419        ):
9420
9421            # barcode annotation field
9422            if not tag:
9423                tag = "BCF"
9424
9425            # VCF infos tags
9426            vcf_infos_tags = {
9427                tag: "barcode family calculation",
9428                f"{tag}S": "barcode family samples",
9429            }
9430
9431            # Param
9432            param = self.get_param()
9433            log.debug(f"param={param}")
9434
9435            # Prefix
9436            prefix = self.get_explode_infos_prefix()
9437
9438            # PED param
9439            ped = (
9440                param.get("calculation", {})
9441                .get("calculations", {})
9442                .get("BARCODEFAMILY", {})
9443                .get("family_pedigree", None)
9444            )
9445            log.debug(f"ped={ped}")
9446
9447            # Load PED
9448            if ped:
9449
9450                # Pedigree is a file
9451                if isinstance(ped, str) and os.path.exists(full_path(ped)):
9452                    log.debug("Pedigree is file")
9453                    with open(full_path(ped)) as ped:
9454                        ped = yaml.safe_load(ped)
9455
9456                # Pedigree is a string
9457                elif isinstance(ped, str):
9458                    log.debug("Pedigree is str")
9459                    try:
9460                        ped = json.loads(ped)
9461                        log.debug("Pedigree is json str")
9462                    except ValueError as e:
9463                        ped_samples = ped.split(",")
9464                        ped = {}
9465                        for ped_sample in ped_samples:
9466                            ped[ped_sample] = ped_sample
9467
9468                # Pedigree is a dict
9469                elif isinstance(ped, dict):
9470                    log.debug("Pedigree is dict")
9471
9472                # Pedigree is not well formatted
9473                else:
9474                    msg_error = "Pedigree not well formatted"
9475                    log.error(msg_error)
9476                    raise ValueError(msg_error)
9477
9478                # Construct list
9479                ped_samples = list(ped.values())
9480
9481            else:
9482                log.debug("Pedigree not defined. Take all samples")
9483                ped_samples = self.get_header_sample_list()
9484                ped = {}
9485                for ped_sample in ped_samples:
9486                    ped[ped_sample] = ped_sample
9487
9488            # Check pedigree
9489            if not ped or len(ped) == 0:
9490                msg_error = f"Error in pedigree: samples {ped_samples}"
9491                log.error(msg_error)
9492                raise ValueError(msg_error)
9493
9494            # Log
9495            log.info(
9496                "Calculation 'BARCODEFAMILY' - Samples: "
9497                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
9498            )
9499            log.debug(f"ped_samples={ped_samples}")
9500
9501            # Field
9502            barcode_infos = prefix + tag
9503
9504            # Variants table
9505            table_variants = self.get_table_variants()
9506
9507            # Header
9508            vcf_reader = self.get_header()
9509
9510            # Create variant id
9511            variant_id_column = self.get_variant_id_column()
9512            added_columns = [variant_id_column]
9513
9514            # variant_id, FORMAT and samples
9515            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9516                [f""" "{sample}" """ for sample in ped_samples]
9517            )
9518
9519            # Create dataframe
9520            dataframe_barcode = self.get_query_to_df(
9521                f""" SELECT {samples_fields} FROM {table_variants} """
9522            )
9523
9524            # Create barcode column
9525            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9526                lambda row: barcode(row, samples=ped_samples), axis=1
9527            )
9528
9529            # Add barcode family to header
9530            # Add vaf_normalization to header
9531            vcf_reader.formats[tag] = vcf.parser._Format(
9532                id=tag,
9533                num=".",
9534                type="String",
9535                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
9536                type_code=self.code_type_map.get("String"),
9537            )
9538            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
9539                id=f"{tag}S",
9540                num=".",
9541                type="String",
9542                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
9543                type_code=self.code_type_map.get("String"),
9544            )
9545
9546            # Update
9547            # for sample in ped_samples:
9548            sql_update_set = []
9549            for sample in self.get_header_sample_list() + ["FORMAT"]:
9550                if sample in ped_samples:
9551                    value = f'dataframe_barcode."{barcode_infos}"'
9552                    value_samples = "'" + ",".join([f""" "{sample}" """ for sample in ped_samples]) + "'"
9553                    ped_samples
9554                elif sample == "FORMAT":
9555                    value = f"'{tag}'"
9556                    value_samples = f"'{tag}S'"
9557                else:
9558                    value = "'.'"
9559                    value_samples = "'.'"
9560                format_regex = r"[a-zA-Z0-9\s]"
9561                sql_update_set.append(
9562                    f"""
9563                        "{sample}" = 
9564                        concat(
9565                            CASE
9566                                WHEN {table_variants}."{sample}" = './.'
9567                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
9568                                ELSE {table_variants}."{sample}"
9569                            END,
9570                            ':',
9571                            {value},
9572                            ':',
9573                            {value_samples}
9574                        )
9575                    """
9576                )
9577
9578            sql_update_set_join = ", ".join(sql_update_set)
9579            sql_update = f"""
9580                UPDATE {table_variants}
9581                SET {sql_update_set_join}
9582                FROM dataframe_barcode
9583                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9584            """
9585            self.conn.execute(sql_update)
9586
9587            # Remove added columns
9588            for added_column in added_columns:
9589                self.drop_column(column=added_column)
9590
9591            # Delete dataframe
9592            del dataframe_barcode
9593            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9595    def calculation_trio(self) -> None:
9596        """
9597        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9598        information to the INFO field of each variant.
9599        """
9600
9601        # if FORMAT and samples
9602        if (
9603            "FORMAT" in self.get_header_columns_as_list()
9604            and self.get_header_sample_list()
9605        ):
9606
9607            # trio annotation field
9608            trio_tag = "trio"
9609
9610            # VCF infos tags
9611            vcf_infos_tags = {
9612                "trio": "trio calculation",
9613            }
9614
9615            # Param
9616            param = self.get_param()
9617
9618            # Prefix
9619            prefix = self.get_explode_infos_prefix()
9620
9621            # Trio param
9622            trio_ped = (
9623                param.get("calculation", {})
9624                .get("calculations", {})
9625                .get("TRIO", {})
9626                .get("trio_pedigree", None)
9627            )
9628
9629            # Load trio
9630            if trio_ped:
9631
9632                # Trio pedigree is a file
9633                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9634                    log.debug("TRIO pedigree is file")
9635                    with open(full_path(trio_ped)) as trio_ped:
9636                        trio_ped = yaml.safe_load(trio_ped)
9637
9638                # Trio pedigree is a string
9639                elif isinstance(trio_ped, str):
9640                    log.debug("TRIO pedigree is str")
9641                    try:
9642                        trio_ped = json.loads(trio_ped)
9643                        log.debug("TRIO pedigree is json str")
9644                    except ValueError as e:
9645                        trio_samples = trio_ped.split(",")
9646                        if len(trio_samples) == 3:
9647                            trio_ped = {
9648                                "father": trio_samples[0],
9649                                "mother": trio_samples[1],
9650                                "child": trio_samples[2],
9651                            }
9652                            log.debug("TRIO pedigree is list str")
9653                        else:
9654                            msg_error = "TRIO pedigree not well formatted"
9655                            log.error(msg_error)
9656                            raise ValueError(msg_error)
9657
9658                # Trio pedigree is a dict
9659                elif isinstance(trio_ped, dict):
9660                    log.debug("TRIO pedigree is dict")
9661
9662                # Trio pedigree is not well formatted
9663                else:
9664                    msg_error = "TRIO pedigree not well formatted"
9665                    log.error(msg_error)
9666                    raise ValueError(msg_error)
9667
9668                # Construct trio list
9669                trio_samples = [
9670                    trio_ped.get("father", ""),
9671                    trio_ped.get("mother", ""),
9672                    trio_ped.get("child", ""),
9673                ]
9674
9675            else:
9676                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9677                samples_list = self.get_header_sample_list()
9678                if len(samples_list) >= 3:
9679                    trio_samples = self.get_header_sample_list()[0:3]
9680                    trio_ped = {
9681                        "father": trio_samples[0],
9682                        "mother": trio_samples[1],
9683                        "child": trio_samples[2],
9684                    }
9685                else:
9686                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9687                    log.error(msg_error)
9688                    raise ValueError(msg_error)
9689
9690            # Check trio pedigree
9691            if not trio_ped or len(trio_ped) != 3:
9692                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9693                log.error(msg_error)
9694                raise ValueError(msg_error)
9695
9696            # Log
9697            log.info(
9698                f"Calculation 'TRIO' - Samples: "
9699                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9700            )
9701
9702            # Field
9703            trio_infos = prefix + trio_tag
9704
9705            # Variants table
9706            table_variants = self.get_table_variants()
9707
9708            # Header
9709            vcf_reader = self.get_header()
9710
9711            # Create variant id
9712            variant_id_column = self.get_variant_id_column()
9713            added_columns = [variant_id_column]
9714
9715            # variant_id, FORMAT and samples
9716            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9717                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9718            )
9719
9720            # Create dataframe
9721            dataframe_trio = self.get_query_to_df(
9722                f""" SELECT {samples_fields} FROM {table_variants} """
9723            )
9724
9725            # Create trio column
9726            dataframe_trio[trio_infos] = dataframe_trio.apply(
9727                lambda row: trio(row, samples=trio_samples), axis=1
9728            )
9729
9730            # Add trio to header
9731            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9732                trio_tag,
9733                ".",
9734                "String",
9735                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9736                "howard calculation",
9737                "0",
9738                self.code_type_map.get("String"),
9739            )
9740
9741            # Update
9742            sql_update = f"""
9743                UPDATE {table_variants}
9744                SET "INFO" = 
9745                    concat(
9746                        CASE
9747                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9748                            THEN ''
9749                            ELSE concat("INFO", ';')
9750                        END,
9751                        CASE
9752                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9753                             AND dataframe_trio."{trio_infos}" NOT NULL
9754                            THEN concat(
9755                                    '{trio_tag}=',
9756                                    dataframe_trio."{trio_infos}"
9757                                )
9758                            ELSE ''
9759                        END
9760                    )
9761                FROM dataframe_trio
9762                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9763            """
9764            self.conn.execute(sql_update)
9765
9766            # Remove added columns
9767            for added_column in added_columns:
9768                self.drop_column(column=added_column)
9769
9770            # Delete dataframe
9771            del dataframe_trio
9772            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9774    def calculation_vaf_normalization(self) -> None:
9775        """
9776        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9777        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9778        :return: The function does not return anything.
9779        """
9780
9781        # if FORMAT and samples
9782        if (
9783            "FORMAT" in self.get_header_columns_as_list()
9784            and self.get_header_sample_list()
9785        ):
9786
9787            # vaf_normalization annotation field
9788            vaf_normalization_tag = "VAF"
9789
9790            # VCF infos tags
9791            vcf_infos_tags = {
9792                "VAF": "VAF Variant Frequency",
9793            }
9794
9795            # Prefix
9796            prefix = self.get_explode_infos_prefix()
9797
9798            # Variants table
9799            table_variants = self.get_table_variants()
9800
9801            # Header
9802            vcf_reader = self.get_header()
9803
9804            # Do not calculate if VAF already exists
9805            if "VAF" in vcf_reader.formats:
9806                log.debug("VAF already on genotypes")
9807                return
9808
9809            # Create variant id
9810            variant_id_column = self.get_variant_id_column()
9811            added_columns = [variant_id_column]
9812
9813            # variant_id, FORMAT and samples
9814            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9815                f""" "{sample}" """ for sample in self.get_header_sample_list()
9816            )
9817
9818            # Create dataframe
9819            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9820            log.debug(f"query={query}")
9821            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9822
9823            vaf_normalization_set = []
9824
9825            # for each sample vaf_normalization
9826            for sample in self.get_header_sample_list():
9827                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9828                    lambda row: vaf_normalization(row, sample=sample), axis=1
9829                )
9830                vaf_normalization_set.append(
9831                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9832                )
9833
9834            # Add VAF to FORMAT
9835            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9836                "FORMAT"
9837            ].apply(lambda x: str(x) + ":VAF")
9838            vaf_normalization_set.append(
9839                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9840            )
9841
9842            # Add vaf_normalization to header
9843            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9844                id=vaf_normalization_tag,
9845                num="1",
9846                type="Float",
9847                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9848                type_code=self.code_type_map.get("Float"),
9849            )
9850
9851            # Create fields to add in INFO
9852            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9853
9854            # Update
9855            sql_update = f"""
9856                UPDATE {table_variants}
9857                SET {sql_vaf_normalization_set}
9858                FROM dataframe_vaf_normalization
9859                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9860
9861            """
9862            self.conn.execute(sql_update)
9863
9864            # Remove added columns
9865            for added_column in added_columns:
9866                self.drop_column(column=added_column)
9867
9868            # Delete dataframe
9869            del dataframe_vaf_normalization
9870            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Statistics are computed per variant across all samples (delegated to the
        `genotype_stats` helper) and appended to the INFO column as tags
        `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`, `<info>_stats_max`,
        `<info>_stats_mean`, `<info>_stats_mediane` and `<info>_stats_stdev`.
        No-op when the VCF has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype statistics only make sense with a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF INFO tag names and their header descriptions
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe.
            # NOTE: the variable name `dataframe_vaf_stats` is significant — the SQL
            # below references it directly and DuckDB resolves it from this local scope.
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict for each variant row across all samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per stats tag
            sql_vaf_stats_fields = []

            # Extract each statistic into its own column and declare it in the header
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict (empty string when absent)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stats tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Tags after the first one are separated by ';' in INFO
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment appending '<stat>=<value>' to INFO when the value is not NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the stats tags to INFO, joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly (it can be large)
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
10010    def calculation_transcripts_annotation(
10011        self, info_json: str = None, info_format: str = None
10012    ) -> None:
10013        """
10014        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10015        field to it if transcripts are available.
10016
10017        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10018        is a string parameter that represents the information field to be used in the transcripts JSON.
10019        It is used to specify the JSON format for the transcripts information. If no value is provided
10020        when calling the method, it defaults to "
10021        :type info_json: str
10022        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10023        method is a string parameter that specifies the format of the information field to be used in
10024        the transcripts JSON. It is used to define the format of the information field
10025        :type info_format: str
10026        """
10027
10028        # Create transcripts table
10029        transcripts_table = self.create_transcript_view()
10030
10031        # Add info field
10032        if transcripts_table:
10033            self.transcript_view_to_variants(
10034                transcripts_table=transcripts_table,
10035                transcripts_info_field_json=info_json,
10036                transcripts_info_field_format=info_format,
10037            )
10038        else:
10039            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the name of the INFO field receiving the transcripts information serialized as JSON. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
10041    def calculation_transcripts_prioritization(self) -> None:
10042        """
10043        The function `calculation_transcripts_prioritization` creates a transcripts table and
10044        prioritizes transcripts based on certain criteria.
10045        """
10046
10047        # Create transcripts table
10048        transcripts_table = self.create_transcript_view()
10049
10050        # Add info field
10051        if transcripts_table:
10052            self.transcripts_prioritization(transcripts_table=transcripts_table)
10053        else:
10054            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
10056    def calculation_transcripts_export(self) -> None:
10057        """ """
10058
10059        # Create transcripts table
10060        transcripts_table = self.create_transcript_view()
10061
10062        # Add info field
10063        if transcripts_table:
10064            self.transcripts_export(transcripts_table=transcripts_table)
10065        else:
10066            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
10072    def transcripts_export(
10073        self, transcripts_table: str = None, param: dict = {}
10074    ) -> bool:
10075        """ """
10076
10077        log.debug("Start transcripts export...")
10078
10079        # Param
10080        if not param:
10081            param = self.get_param()
10082
10083        # Param export
10084        param_transcript_export = param.get("transcripts", {}).get("export", {})
10085
10086        # Output file
10087        transcripts_export_output = param_transcript_export.get("output", None)
10088
10089        if not param_transcript_export or not transcripts_export_output:
10090            log.warning(f"No transcriipts export parameters defined!")
10091            return False
10092
10093        # List of transcripts annotations
10094        query_describe = f"""
10095            SELECT column_name
10096            FROM (
10097                    DESCRIBE SELECT * FROM {transcripts_table}
10098                )
10099            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10100        """
10101        transcripts_annotations_list = list(
10102            self.get_query_to_df(query=query_describe)["column_name"]
10103        )
10104
10105        # Create transcripts table for export
10106        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10107            random.choices(string.ascii_uppercase + string.digits, k=10)
10108        )
10109        query_create_transcripts_table_export = f"""
10110            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10111        """
10112        self.execute_query(query=query_create_transcripts_table_export)
10113
10114        # Output file format
10115        transcripts_export_output_format = get_file_format(
10116            filename=transcripts_export_output
10117        )
10118
10119        # Format VCF - construct INFO
10120        if transcripts_export_output_format in ["vcf"]:
10121
10122            # Construct query update INFO and header
10123            query_update_info = []
10124            for field in transcripts_annotations_list:
10125
10126                # If field not in header
10127                if field not in self.get_header_infos_list():
10128
10129                    # Add PZ Transcript in header
10130                    self.get_header().infos[field] = vcf.parser._Info(
10131                        field,
10132                        ".",
10133                        "String",
10134                        f"Annotation '{field}' from transcript view",
10135                        "unknown",
10136                        "unknown",
10137                        0,
10138                    )
10139
10140                # Add field as INFO/tag
10141                query_update_info.append(
10142                    f"""
10143                        CASE
10144                            WHEN "{field}" IS NOT NULL
10145                            THEN concat('{field}=', "{field}", ';')    
10146                            ELSE ''     
10147                        END
10148                        """
10149                )
10150
10151            # Query param
10152            query_update_info_value = (
10153                f""" concat('',  {", ".join(query_update_info)}) """
10154            )
10155            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10156
10157        else:
10158
10159            # Query param
10160            query_update_info_value = f""" NULL """
10161            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10162
10163        # Update query INFO column
10164        query_update = f"""
10165            UPDATE {transcripts_table_export}
10166            SET INFO = {query_update_info_value}
10167
10168        """
10169        self.execute_query(query=query_update)
10170
10171        # Export
10172        self.export_output(
10173            output_file=transcripts_export_output,
10174            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10175        )
10176
10177        # Drop transcripts export table
10178        query_drop_transcripts_table_export = f"""
10179            DROP TABLE {transcripts_table_export}
10180        """
10181        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
10183    def transcripts_prioritization(
10184        self, transcripts_table: str = None, param: dict = {}
10185    ) -> bool:
10186        """
10187        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10188        and updates the variants table with the prioritized information.
10189
10190        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10191        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10192        This parameter is used to identify the table where the transcripts data is stored for the
10193        prioritization process
10194        :type transcripts_table: str
10195        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10196        that contains various configuration settings for the prioritization process of transcripts. It
10197        is used to customize the behavior of the prioritization algorithm and includes settings such as
10198        the prefix for prioritization fields, default profiles, and other
10199        :type param: dict
10200        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10201        transcripts prioritization process is successfully completed, and `False` if there are any
10202        issues or if no profile is defined for transcripts prioritization.
10203        """
10204
10205        log.debug("Start transcripts prioritization...")
10206
10207        # Param
10208        if not param:
10209            param = self.get_param()
10210
10211        # Variants table
10212        table_variants = self.get_table_variants()
10213
10214        # Transcripts table
10215        if transcripts_table is None:
10216            transcripts_table = self.create_transcript_view(
10217                transcripts_table="transcripts", param=param
10218            )
10219        if transcripts_table is None:
10220            msg_err = "No Transcripts table availalble"
10221            log.error(msg_err)
10222            raise ValueError(msg_err)
10223        log.debug(f"transcripts_table={transcripts_table}")
10224
10225        # Get transcripts columns
10226        columns_as_list_query = f"""
10227            DESCRIBE {transcripts_table}
10228        """
10229        columns_as_list = list(
10230            self.get_query_to_df(columns_as_list_query)["column_name"]
10231        )
10232
10233        # Create INFO if not exists
10234        if "INFO" not in columns_as_list:
10235            query_add_info = f"""
10236                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10237            """
10238            self.execute_query(query_add_info)
10239
10240        # Prioritization param and Force only PZ Score and Flag
10241        pz_param = param.get("transcripts", {}).get("prioritization", {})
10242
10243        # PZ profile by default
10244        pz_profile_default = (
10245            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10246        )
10247
10248        # Exit if no profile
10249        if pz_profile_default is None:
10250            log.warning("No profile defined for transcripts prioritization")
10251            return False
10252
10253        # PZ fields
10254        pz_param_pzfields = {}
10255
10256        # PZ field transcripts
10257        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10258
10259        # Add PZ Transcript in header
10260        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10261            pz_fields_transcripts,
10262            ".",
10263            "String",
10264            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10265            "unknown",
10266            "unknown",
10267            code_type_map["String"],
10268        )
10269
10270        # Mandatory fields
10271        pz_mandatory_fields_list = [
10272            "Score",
10273            "Flag",
10274            "Tags",
10275            "Comment",
10276            "Infos",
10277            "Class",
10278        ]
10279        pz_mandatory_fields = []
10280        for pz_mandatory_field in pz_mandatory_fields_list:
10281            pz_mandatory_fields.append(
10282                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10283            )
10284
10285        # PZ fields in param
10286        for pz_field in pz_param.get("pzfields", []):
10287            if pz_field in pz_mandatory_fields_list:
10288                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10289                    pz_param.get("pzprefix", "PTZ") + pz_field
10290                )
10291            else:
10292                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10293                pz_param_pzfields[pz_field] = pz_field_new
10294
10295                # Add PZ Transcript in header
10296                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10297                    pz_field_new,
10298                    ".",
10299                    "String",
10300                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10301                    "unknown",
10302                    "unknown",
10303                    code_type_map["String"],
10304                )
10305
10306        # PZ fields param
10307        pz_param["pzfields"] = pz_mandatory_fields
10308
10309        # Prioritization
10310        prioritization_result = self.prioritization(
10311            table=transcripts_table,
10312            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10313        )
10314        if not prioritization_result:
10315            log.warning("Transcripts prioritization not processed")
10316            return False
10317
10318        # PZ fields sql query
10319        query_update_select_list = []
10320        query_update_concat_list = []
10321        query_update_order_list = []
10322        for pz_param_pzfield in set(
10323            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10324        ):
10325            query_update_select_list.append(f" {pz_param_pzfield}, ")
10326
10327        for pz_param_pzfield in pz_param_pzfields:
10328            query_update_concat_list.append(
10329                f"""
10330                    , CASE 
10331                        WHEN {pz_param_pzfield} IS NOT NULL
10332                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10333                        ELSE ''
10334                    END
10335                """
10336            )
10337
10338        # Order by
10339        pz_orders = (
10340            param.get("transcripts", {})
10341            .get("prioritization", {})
10342            .get("prioritization_transcripts_order", {})
10343        )
10344        if not pz_orders:
10345            pz_orders = {
10346                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10347                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10348            }
10349        for pz_order in pz_orders:
10350            query_update_order_list.append(
10351                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10352            )
10353
10354        # Fields to explode
10355        fields_to_explode = (
10356            list(pz_param_pzfields.keys())
10357            + pz_mandatory_fields
10358            + list(pz_orders.keys())
10359        )
10360        # Remove transcript column as a specific transcript column
10361        if "transcript" in fields_to_explode:
10362            fields_to_explode.remove("transcript")
10363
10364        # Fields intranscripts table
10365        query_transcripts_table = f"""
10366            DESCRIBE SELECT * FROM {transcripts_table}
10367        """
10368        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10369
10370        # Check fields to explode
10371        for field_to_explode in fields_to_explode:
10372            if field_to_explode not in self.get_header_infos_list() + list(
10373                query_transcripts_table.column_name
10374            ):
10375                msg_err = f"INFO/{field_to_explode} NOT IN header"
10376                log.error(msg_err)
10377                raise ValueError(msg_err)
10378
10379        # Explode fields to explode
10380        self.explode_infos(
10381            table=transcripts_table,
10382            fields=fields_to_explode,
10383        )
10384
10385        # Transcript preference file
10386        transcripts_preference_file = (
10387            param.get("transcripts", {})
10388            .get("prioritization", {})
10389            .get("prioritization_transcripts", {})
10390        )
10391        transcripts_preference_file = full_path(transcripts_preference_file)
10392
10393        # Transcript preference forced
10394        transcript_preference_force = (
10395            param.get("transcripts", {})
10396            .get("prioritization", {})
10397            .get("prioritization_transcripts_force", False)
10398        )
10399        # Transcript version forced
10400        transcript_version_force = (
10401            param.get("transcripts", {})
10402            .get("prioritization", {})
10403            .get("prioritization_transcripts_version_force", False)
10404        )
10405
10406        # Transcripts Ranking
10407        if transcripts_preference_file:
10408
10409            # Transcripts file to dataframe
10410            if os.path.exists(transcripts_preference_file):
10411                transcripts_preference_dataframe = transcripts_file_to_df(
10412                    transcripts_preference_file
10413                )
10414            else:
10415                log.error(
10416                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10417                )
10418                raise ValueError(
10419                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10420                )
10421
10422            # Order by depending to transcript preference forcing
10423            if transcript_preference_force:
10424                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10425            else:
10426                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10427
10428            # Transcript columns joined depend on version consideration
10429            if transcript_version_force:
10430                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10431            else:
10432                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10433
10434            # Query ranking for update
10435            query_update_ranking = f"""
10436                SELECT
10437                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10438                    ROW_NUMBER() OVER (
10439                        PARTITION BY "#CHROM", POS, REF, ALT
10440                        ORDER BY {order_by}
10441                    ) AS rn
10442                FROM {transcripts_table}
10443                LEFT JOIN 
10444                    (
10445                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10446                        FROM transcripts_preference_dataframe
10447                    ) AS transcripts_preference
10448                ON {transcripts_version_join}
10449            """
10450
10451        else:
10452
10453            # Query ranking for update
10454            query_update_ranking = f"""
10455                SELECT
10456                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10457                    ROW_NUMBER() OVER (
10458                        PARTITION BY "#CHROM", POS, REF, ALT
10459                        ORDER BY {" , ".join(query_update_order_list)}
10460                    ) AS rn
10461                FROM {transcripts_table}
10462            """
10463
10464        # Export Transcripts prioritization infos to variants table
10465        query_update = f"""
10466            WITH RankedTranscripts AS (
10467                {query_update_ranking}
10468            )
10469            UPDATE {table_variants}
10470                SET
10471                INFO = CONCAT(CASE
10472                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10473                            THEN ''
10474                            ELSE concat("INFO", ';')
10475                        END,
10476                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10477                        )
10478            FROM
10479                RankedTranscripts
10480            WHERE
10481                rn = 1
10482                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10483                AND variants."POS" = RankedTranscripts."POS"
10484                AND variants."REF" = RankedTranscripts."REF"
10485                AND variants."ALT" = RankedTranscripts."ALT"     
10486        """
10487
10488        # log.debug(f"query_update={query_update}")
10489        self.execute_query(query=query_update)
10490
10491        # Return
10492        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10494    def create_transcript_view_from_columns_map(
10495        self,
10496        transcripts_table: str = "transcripts",
10497        columns_maps: dict = {},
10498        added_columns: list = [],
10499        temporary_tables: list = None,
10500        annotation_fields: list = None,
10501        column_rename: dict = {},
10502        column_clean: bool = False,
10503        column_case: str = None,
10504    ) -> tuple[list, list, list]:
10505        """
10506        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10507        specified columns mapping for transcripts data.
10508
10509        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10510        of the table where the transcripts data is stored or will be stored in the database. This table
10511        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10512        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10513        :type transcripts_table: str (optional)
10514        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10515        about how to map columns from a transcripts table to create a view. Each entry in the
10516        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10517        typically includes details such as the main transcript column and additional information columns
10518        :type columns_maps: dict
10519        :param added_columns: The `added_columns` parameter in the
10520        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10521        that will be added to the view being created based on the columns map provided. These columns
10522        are generated by exploding the transcript information columns along with the main transcript
10523        column
10524        :type added_columns: list
10525        :param temporary_tables: The `temporary_tables` parameter in the
10526        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10527        tables created during the process of creating a transcript view from a columns map. These
10528        temporary tables are used to store intermediate results or transformations before the final view
10529        is generated
10530        :type temporary_tables: list
10531        :param annotation_fields: The `annotation_fields` parameter in the
10532        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10533        used for annotation in the query view creation process. These fields are extracted from the
10534        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10535        :type annotation_fields: list
10536        :param column_rename: The `column_rename` parameter in the
10537        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10538        custom renaming for columns during the creation of the temporary table view. This parameter
10539        provides a mapping of original column names to the desired renamed column names. By using this
10540        parameter,
10541        :type column_rename: dict
10542        :param column_clean: The `column_clean` parameter in the
10543        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10544        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10545        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10546        False
10547        :type column_clean: bool (optional)
10548        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10549        function is used to specify the case transformation to be applied to the columns during the view
10550        creation process. It allows you to control whether the column values should be converted to
10551        lowercase, uppercase, or remain unchanged
10552        :type column_case: str
10553        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10554        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10555        """
10556
10557        log.debug("Start transcrpts view creation from columns map...")
10558
10559        # "from_columns_map": [
10560        #     {
10561        #         "transcripts_column": "Ensembl_transcriptid",
10562        #         "transcripts_infos_columns": [
10563        #             "genename",
10564        #             "Ensembl_geneid",
10565        #             "LIST_S2_score",
10566        #             "LIST_S2_pred",
10567        #         ],
10568        #     },
10569        #     {
10570        #         "transcripts_column": "Ensembl_transcriptid",
10571        #         "transcripts_infos_columns": [
10572        #             "genename",
10573        #             "VARITY_R_score",
10574        #             "Aloft_pred",
10575        #         ],
10576        #     },
10577        # ],
10578
10579        # Init
10580        if temporary_tables is None:
10581            temporary_tables = []
10582        if annotation_fields is None:
10583            annotation_fields = []
10584
10585        # Variants table
10586        table_variants = self.get_table_variants()
10587
10588        for columns_map in columns_maps:
10589
10590            # Transcript column
10591            transcripts_column = columns_map.get("transcripts_column", None)
10592
10593            # Transcripts infos columns
10594            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10595
10596            # Transcripts infos columns rename
10597            column_rename = columns_map.get("column_rename", column_rename)
10598
10599            # Transcripts infos columns clean
10600            column_clean = columns_map.get("column_clean", column_clean)
10601
10602            # Transcripts infos columns case
10603            column_case = columns_map.get("column_case", column_case)
10604
10605            if transcripts_column is not None:
10606
10607                # Explode
10608                added_columns += self.explode_infos(
10609                    fields=[transcripts_column] + transcripts_infos_columns
10610                )
10611
10612                # View clauses
10613                clause_select_variants = []
10614                clause_select_tanscripts = []
10615                for field in [transcripts_column] + transcripts_infos_columns:
10616
10617                    # AS field
10618                    as_field = field
10619
10620                    # Rename
10621                    if column_rename:
10622                        as_field = column_rename.get(as_field, as_field)
10623
10624                    # Clean
10625                    if column_clean:
10626                        as_field = clean_annotation_field(as_field)
10627
10628                    # Case
10629                    if column_case:
10630                        if column_case.lower() in ["lower"]:
10631                            as_field = as_field.lower()
10632                        elif column_case.lower() in ["upper"]:
10633                            as_field = as_field.upper()
10634
10635                    # Clause select Variants
10636                    clause_select_variants.append(
10637                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10638                    )
10639
10640                    if field in [transcripts_column]:
10641                        clause_select_tanscripts.append(
10642                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10643                        )
10644                    else:
10645                        clause_select_tanscripts.append(
10646                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10647                        )
10648                        annotation_fields.append(as_field)
10649
10650                # Querey View
10651                query = f""" 
10652                    SELECT
10653                        "#CHROM", POS, REF, ALT, INFO,
10654                        "{transcripts_column}" AS 'transcript',
10655                        {", ".join(clause_select_tanscripts)}
10656                    FROM (
10657                        SELECT 
10658                            "#CHROM", POS, REF, ALT, INFO,
10659                            {", ".join(clause_select_variants)}
10660                        FROM {table_variants}
10661                        )
10662                    WHERE "{transcripts_column}" IS NOT NULL
10663                """
10664
10665                # Create temporary table
10666                temporary_table = transcripts_table + "".join(
10667                    random.choices(string.ascii_uppercase + string.digits, k=10)
10668                )
10669
10670                # Temporary_tables
10671                temporary_tables.append(temporary_table)
10672                query_view = f"""
10673                    CREATE TEMPORARY TABLE {temporary_table}
10674                    AS ({query})
10675                """
10676                self.execute_query(query=query_view)
10677
10678        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns map.
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that maps original column names to the desired renamed column names, allowing specific columns to be renamed during the creation of the temporary table view.
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column names should be cleaned. If set to True, column names are cleaned by removing any non-alphanumeric characters. Defaults to False.
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10680    def create_transcript_view_from_column_format(
10681        self,
10682        transcripts_table: str = "transcripts",
10683        column_formats: dict = {},
10684        temporary_tables: list = None,
10685        annotation_fields: list = None,
10686        column_rename: dict = {},
10687        column_clean: bool = False,
10688        column_case: str = None,
10689    ) -> tuple[list, list, list]:
10690        """
10691        The `create_transcript_view_from_column_format` function generates a transcript view based on
10692        specified column formats, adds additional columns and annotation fields, and returns the list of
10693        temporary tables and annotation fields.
10694
10695        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10696        of the table containing the transcripts data. This table will be used as the base table for
10697        creating the transcript view. The default value for this parameter is "transcripts", but you can
10698        provide a different table name if needed, defaults to transcripts
10699        :type transcripts_table: str (optional)
10700        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10701        about the columns to be used for creating the transcript view. Each entry in the dictionary
10702        specifies the mapping between a transcripts column and a transcripts infos column. This
10703        parameter allows you to define how the columns from the transcripts table should be transformed
10704        or mapped
10705        :type column_formats: dict
10706        :param temporary_tables: The `temporary_tables` parameter in the
10707        `create_transcript_view_from_column_format` function is a list that stores the names of
10708        temporary views created during the process of creating a transcript view from a column format.
10709        These temporary views are used to manipulate and extract data before generating the final
10710        transcript view
10711        :type temporary_tables: list
10712        :param annotation_fields: The `annotation_fields` parameter in the
10713        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10714        that are extracted from the temporary views created during the process. These annotation fields
10715        are obtained by querying the temporary views and extracting the column names excluding specific
10716        columns like `#CH
10717        :type annotation_fields: list
10718        :param column_rename: The `column_rename` parameter in the
10719        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10720        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10721        column names to new column names in this dictionary, you can rename specific columns during the
10722        process
10723        :type column_rename: dict
10724        :param column_clean: The `column_clean` parameter in the
10725        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10726        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10727        will be cleaned during the creation of the transcript view based on the specified column format,
10728        defaults to False
10729        :type column_clean: bool (optional)
10730        :param column_case: The `column_case` parameter in the
10731        `create_transcript_view_from_column_format` function is used to specify the case transformation
10732        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10733        to convert the column names to uppercase or lowercase, respectively
10734        :type column_case: str
10735        :return: The `create_transcript_view_from_column_format` function returns two lists:
10736        `temporary_tables` and `annotation_fields`.
10737        """
10738
10739        log.debug("Start transcrpts view creation from column format...")
10740
10741        #  "from_column_format": [
10742        #     {
10743        #         "transcripts_column": "ANN",
10744        #         "transcripts_infos_column": "Feature_ID",
10745        #     }
10746        # ],
10747
10748        # Init
10749        if temporary_tables is None:
10750            temporary_tables = []
10751        if annotation_fields is None:
10752            annotation_fields = []
10753
10754        for column_format in column_formats:
10755
10756            # annotation field and transcript annotation field
10757            annotation_field = column_format.get("transcripts_column", "ANN")
10758            transcript_annotation = column_format.get(
10759                "transcripts_infos_column", "Feature_ID"
10760            )
10761
10762            # Transcripts infos columns rename
10763            column_rename = column_format.get("column_rename", column_rename)
10764
10765            # Transcripts infos columns clean
10766            column_clean = column_format.get("column_clean", column_clean)
10767
10768            # Transcripts infos columns case
10769            column_case = column_format.get("column_case", column_case)
10770
10771            # Temporary View name
10772            temporary_view_name = transcripts_table + "".join(
10773                random.choices(string.ascii_uppercase + string.digits, k=10)
10774            )
10775
10776            # Create temporary view name
10777            temporary_view_name = self.annotation_format_to_table(
10778                uniquify=True,
10779                annotation_field=annotation_field,
10780                view_name=temporary_view_name,
10781                annotation_id=transcript_annotation,
10782                column_rename=column_rename,
10783                column_clean=column_clean,
10784                column_case=column_case,
10785            )
10786
10787            # Annotation fields
10788            if temporary_view_name:
10789                query_annotation_fields = f"""
10790                    SELECT *
10791                    FROM (
10792                        DESCRIBE SELECT *
10793                        FROM {temporary_view_name}
10794                        )
10795                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10796                """
10797                df_annotation_fields = self.get_query_to_df(
10798                    query=query_annotation_fields
10799                )
10800
10801                # Add temporary view and annotation fields
10802                temporary_tables.append(temporary_view_name)
10803                annotation_fields += list(set(df_annotation_fields["column_name"]))
10804
10805        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. Defaults to "transcripts"; you can provide a different table name if needed.
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns such as #CHROM, POS, REF, and ALT.
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format. Defaults to False.
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = False, param: dict = {}) -> str:
10807    def create_transcript_view(
10808        self,
10809        transcripts_table: str = None,
10810        transcripts_table_drop: bool = False,
10811        param: dict = {},
10812    ) -> str:
10813        """
10814        The `create_transcript_view` function generates a transcript view by processing data from a
10815        specified table based on provided parameters and structural information.
10816
10817        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10818        is used to specify the name of the table that will store the final transcript view data. If a table
10819        name is not provided, the function will create a new table to store the transcript view data, and by
10820        default,, defaults to transcripts
10821        :type transcripts_table: str (optional)
10822        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10823        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10824        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10825        the function will drop the existing transcripts table if it exists, defaults to False
10826        :type transcripts_table_drop: bool (optional)
10827        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10828        contains information needed to create a transcript view. It includes details such as the structure
10829        of the transcripts, columns mapping, column formats, and other necessary information for generating
10830        the view. This parameter allows for flexibility and customization
10831        :type param: dict
10832        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10833        created or modified during the execution of the function.
10834        """
10835
10836        log.debug("Start transcripts view creation...")
10837
10838        # Default
10839        transcripts_table_default = "transcripts"
10840
10841        # Param
10842        if not param:
10843            param = self.get_param()
10844
10845        # Struct
10846        struct = param.get("transcripts", {}).get("struct", None)
10847
10848        # Transcript veresion
10849        transcript_id_remove_version = param.get("transcripts", {}).get(
10850            "transcript_id_remove_version", False
10851        )
10852
10853        # Transcripts mapping
10854        transcript_id_mapping_file = param.get("transcripts", {}).get(
10855            "transcript_id_mapping_file", None
10856        )
10857
10858        # Transcripts mapping
10859        transcript_id_mapping_force = param.get("transcripts", {}).get(
10860            "transcript_id_mapping_force", None
10861        )
10862
10863        if struct:
10864
10865            # Transcripts table
10866            if transcripts_table is None:
10867                transcripts_table = param.get("transcripts", {}).get(
10868                    "table", transcripts_table_default
10869                )
10870
10871            # added_columns
10872            added_columns = []
10873
10874            # Temporary tables
10875            temporary_tables = []
10876
10877            # Annotation fields
10878            annotation_fields = []
10879
10880            # from columns map
10881            columns_maps = struct.get("from_columns_map", [])
10882            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10883                self.create_transcript_view_from_columns_map(
10884                    transcripts_table=transcripts_table,
10885                    columns_maps=columns_maps,
10886                    added_columns=added_columns,
10887                    temporary_tables=temporary_tables,
10888                    annotation_fields=annotation_fields,
10889                )
10890            )
10891            added_columns += added_columns_tmp
10892            temporary_tables += temporary_tables_tmp
10893            annotation_fields += annotation_fields_tmp
10894
10895            # from column format
10896            column_formats = struct.get("from_column_format", [])
10897            temporary_tables_tmp, annotation_fields_tmp = (
10898                self.create_transcript_view_from_column_format(
10899                    transcripts_table=transcripts_table,
10900                    column_formats=column_formats,
10901                    temporary_tables=temporary_tables,
10902                    annotation_fields=annotation_fields,
10903                )
10904            )
10905            temporary_tables += temporary_tables_tmp
10906            annotation_fields += annotation_fields_tmp
10907
10908            # Remove some specific fields/column
10909            annotation_fields = list(set(annotation_fields))
10910            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10911                if field in annotation_fields:
10912                    annotation_fields.remove(field)
10913
10914            # Merge temporary tables query
10915            query_merge = ""
10916            for temporary_table in list(set(temporary_tables)):
10917
10918                # First temporary table
10919                if not query_merge:
10920                    query_merge = f"""
10921                        SELECT * FROM {temporary_table}
10922                    """
10923                # other temporary table (using UNION)
10924                else:
10925                    query_merge += f"""
10926                        UNION BY NAME SELECT * FROM {temporary_table}
10927                    """
10928
10929            # transcript table tmp
10930            transcript_table_tmp = "transcripts_tmp"
10931            transcript_table_tmp2 = "transcripts_tmp2"
10932            transcript_table_tmp3 = "transcripts_tmp3"
10933
10934            # Merge on transcript
10935            query_merge_on_transcripts_annotation_fields = []
10936
10937            # Add transcript list
10938            query_merge_on_transcripts_annotation_fields.append(
10939                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10940            )
10941
10942            # Aggregate all annotations fields
10943            for annotation_field in set(annotation_fields):
10944                query_merge_on_transcripts_annotation_fields.append(
10945                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10946                )
10947
10948            # Transcripts mapping
10949            if transcript_id_mapping_file:
10950
10951                # Transcript dataframe
10952                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10953                transcript_id_mapping_dataframe = transcripts_file_to_df(
10954                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10955                )
10956
10957                # Transcript version remove
10958                if transcript_id_remove_version:
10959                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10960                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10961                    query_left_join = f"""
10962                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10963                    """
10964                else:
10965                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10966                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10967                    query_left_join = f"""
10968                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10969                    """
10970
10971                # Transcript column for group by merge
10972                query_transcript_merge_group_by = """
10973                        CASE
10974                            WHEN transcript_mapped NOT IN ('')
10975                            THEN split_part(transcript_mapped, '.', 1)
10976                            ELSE split_part(transcript_original, '.', 1)
10977                        END
10978                    """
10979
10980                # Merge query
10981                transcripts_tmp2_query = f"""
10982                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10983                    FROM ({query_merge}) AS {transcript_table_tmp}
10984                    {query_left_join}
10985                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10986                """
10987
10988                # Retrive columns after mege
10989                transcripts_tmp2_describe_query = f"""
10990                    DESCRIBE {transcripts_tmp2_query}
10991                """
10992                transcripts_tmp2_describe_list = list(
10993                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10994                        "column_name"
10995                    ]
10996                )
10997
10998                # Create list of columns for select clause
10999                transcripts_tmp2_describe_select_clause = []
11000                for field in transcripts_tmp2_describe_list:
11001                    if field not in [
11002                        "#CHROM",
11003                        "POS",
11004                        "REF",
11005                        "ALT",
11006                        "INFO",
11007                        "transcript_mapped",
11008                    ]:
11009                        as_field = field
11010                        if field in ["transcript_original"]:
11011                            as_field = "transcripts_mapped"
11012                        transcripts_tmp2_describe_select_clause.append(
11013                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11014                        )
11015
11016                # Merge with mapping
11017                query_merge_on_transcripts = f"""
11018                    SELECT
11019                        "#CHROM", POS, REF, ALT, INFO,
11020                        CASE
11021                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11022                            THEN ANY_VALUE(transcript_mapped)
11023                            ELSE ANY_VALUE(transcript_original)
11024                        END AS transcript,
11025                        {", ".join(transcripts_tmp2_describe_select_clause)}
11026                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11027                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11028                        {query_transcript_merge_group_by}
11029                """
11030
11031                # Add transcript filter from mapping file
11032                if transcript_id_mapping_force:
11033                    query_merge_on_transcripts = f"""
11034                        SELECT *
11035                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11036                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11037                    """
11038
11039            # No transcript mapping
11040            else:
11041
11042                # Remove transcript version
11043                if transcript_id_remove_version:
11044                    query_transcript_column = f"""
11045                        split_part({transcript_table_tmp}.transcript, '.', 1)
11046                    """
11047                else:
11048                    query_transcript_column = """
11049                        transcript
11050                    """
11051
11052                # Query sections
11053                query_transcript_column_select = (
11054                    f"{query_transcript_column} AS transcript"
11055                )
11056                query_transcript_column_group_by = query_transcript_column
11057
11058                # Query for transcripts view
11059                query_merge_on_transcripts = f"""
11060                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11061                    FROM ({query_merge}) AS {transcript_table_tmp}
11062                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11063                """
11064
11065            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11066
11067            # Drop transcript view is necessary
11068            if transcripts_table_drop:
11069                query_drop = f"""
11070                    DROP TABLE IF EXISTS {transcripts_table};
11071                """
11072                self.execute_query(query=query_drop)
11073
11074            # Merge and create transcript view
11075            query_create_view = f"""
11076                CREATE TABLE IF NOT EXISTS {transcripts_table}
11077                AS {query_merge_on_transcripts}
11078            """
11079            self.execute_query(query=query_create_view)
11080
11081            # Remove added columns
11082            for added_column in added_columns:
11083                self.drop_column(column=added_column)
11084
11085        else:
11086
11087            transcripts_table = None
11088
11089        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data; defaults to transcripts
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to False
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
11091    def annotation_format_to_table(
11092        self,
11093        uniquify: bool = True,
11094        annotation_field: str = "ANN",
11095        annotation_id: str = "Feature_ID",
11096        view_name: str = "transcripts",
11097        column_rename: dict = {},
11098        column_clean: bool = False,
11099        column_case: str = None,
11100    ) -> str:
11101        """
11102        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11103        structured table format, ensuring unique values and creating a temporary table for further
11104        processing or analysis.
11105
11106        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11107        unique values in the output or not. If set to `True`, the function will make sure that the
11108        output values are unique, defaults to True
11109        :type uniquify: bool (optional)
11110        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11111        that contains the annotation information for each variant. This field is used to extract the
11112        annotation details for further processing in the function. By default, it is set to "ANN",
11113        defaults to ANN
11114        :type annotation_field: str (optional)
11115        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11116        is used to specify the identifier for the annotation feature. This identifier will be used as a
11117        column name in the resulting table or view that is created based on the annotation data. It
11118        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11119        :type annotation_id: str (optional)
11120        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11121        to specify the name of the temporary table that will be created to store the transformed
11122        annotation data. This table will hold the extracted information from the annotation field in a
11123        structured format for further processing or analysis. By default,, defaults to transcripts
11124        :type view_name: str (optional)
11125        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11126        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11127        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11128        created based on the annotation data. This feature enables
11129        :type column_rename: dict
11130        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11131        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11132        If set to `True`, the function will clean the annotation field before further processing. This
11133        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11134        to False
11135        :type column_clean: bool (optional)
11136        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11137        used to specify the case transformation to be applied to the column names extracted from the
11138        annotation data. It allows you to set the case of the column names to either lowercase or
11139        uppercase for consistency or other specific requirements during the conversion
11140        :type column_case: str
11141        :return: The function `annotation_format_to_table` is returning the name of the view created,
11142        which is stored in the variable `view_name`.
11143        """
11144
11145        # Annotation field
11146        annotation_format = "annotation_explode"
11147
11148        # Transcript annotation
11149        if column_rename:
11150            annotation_id = column_rename.get(annotation_id, annotation_id)
11151
11152        if column_clean:
11153            annotation_id = clean_annotation_field(annotation_id)
11154
11155        # Prefix
11156        prefix = self.get_explode_infos_prefix()
11157        if prefix:
11158            prefix = "INFO/"
11159
11160        # Annotation fields
11161        annotation_infos = prefix + annotation_field
11162        annotation_format_infos = prefix + annotation_format
11163
11164        # Variants table
11165        table_variants = self.get_table_variants()
11166
11167        # Header
11168        vcf_reader = self.get_header()
11169
11170        # Add columns
11171        added_columns = []
11172
11173        # Explode HGVS field in column
11174        added_columns += self.explode_infos(fields=[annotation_field])
11175
11176        if annotation_field in vcf_reader.infos:
11177
11178            # Extract ANN header
11179            ann_description = vcf_reader.infos[annotation_field].desc
11180            pattern = r"'(.+?)'"
11181            match = re.search(pattern, ann_description)
11182            if match:
11183                ann_header_match = match.group(1).split(" | ")
11184                ann_header = []
11185                ann_header_desc = {}
11186                for i in range(len(ann_header_match)):
11187                    ann_header_info = "".join(
11188                        char for char in ann_header_match[i] if char.isalnum()
11189                    )
11190                    ann_header.append(ann_header_info)
11191                    ann_header_desc[ann_header_info] = ann_header_match[i]
11192                if not ann_header_desc:
11193                    raise ValueError("Invalid header description format")
11194            else:
11195                raise ValueError("Invalid header description format")
11196
11197            # Create variant id
11198            variant_id_column = self.get_variant_id_column()
11199            added_columns += [variant_id_column]
11200
11201            # Create dataframe
11202            dataframe_annotation_format = self.get_query_to_df(
11203                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11204            )
11205
11206            # Create annotation columns
11207            dataframe_annotation_format[
11208                annotation_format_infos
11209            ] = dataframe_annotation_format[annotation_infos].apply(
11210                lambda x: explode_annotation_format(
11211                    annotation=str(x),
11212                    uniquify=uniquify,
11213                    output_format="JSON",
11214                    prefix="",
11215                    header=list(ann_header_desc.values()),
11216                )
11217            )
11218
11219            # Find keys
11220            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11221            df_keys = self.get_query_to_df(query=query_json)
11222
11223            # Check keys
11224            query_json_key = []
11225            for _, row in df_keys.iterrows():
11226
11227                # Key
11228                key = row.iloc[0]
11229                key_clean = key
11230
11231                # key rename
11232                if column_rename:
11233                    key_clean = column_rename.get(key_clean, key_clean)
11234
11235                # key clean
11236                if column_clean:
11237                    key_clean = clean_annotation_field(key_clean)
11238
11239                # Key case
11240                if column_case:
11241                    if column_case.lower() in ["lower"]:
11242                        key_clean = key_clean.lower()
11243                    elif column_case.lower() in ["upper"]:
11244                        key_clean = key_clean.upper()
11245
11246                # Type
11247                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11248
11249                # Get DataFrame from query
11250                df_json_type = self.get_query_to_df(query=query_json_type)
11251
11252                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11253                with pd.option_context("future.no_silent_downcasting", True):
11254                    df_json_type.fillna(value="", inplace=True)
11255                    replace_dict = {None: np.nan, "": np.nan}
11256                    df_json_type.replace(replace_dict, inplace=True)
11257                    df_json_type.dropna(inplace=True)
11258
11259                # Detect column type
11260                column_type = detect_column_type(df_json_type[key_clean])
11261
11262                # Append
11263                query_json_key.append(
11264                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11265                )
11266
11267            # Create view
11268            query_view = f"""
11269                CREATE TEMPORARY TABLE {view_name}
11270                AS (
11271                    SELECT *, {annotation_id} AS 'transcript'
11272                    FROM (
11273                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11274                        FROM dataframe_annotation_format
11275                        )
11276                    );
11277            """
11278            self.execute_query(query=query_view)
11279
11280        else:
11281
11282            # Return None
11283            view_name = None
11284
11285        # Remove added columns
11286        for added_column in added_columns:
11287            self.drop_column(column=added_column)
11288
11289        return view_name

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing unwanted characters or formatting inconsistencies; defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11291    def transcript_view_to_variants(
11292        self,
11293        transcripts_table: str = None,
11294        transcripts_column_id: str = None,
11295        transcripts_info_json: str = None,
11296        transcripts_info_field_json: str = None,
11297        transcripts_info_format: str = None,
11298        transcripts_info_field_format: str = None,
11299        param: dict = {},
11300    ) -> bool:
11301        """
11302        The `transcript_view_to_variants` function updates a variants table with information from
11303        transcripts in JSON format.
11304
11305        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11306        table containing the transcripts data. If this parameter is not provided, the function will
11307        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11308        :type transcripts_table: str
11309        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11310        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11311        identifier is used to match transcripts with variants in the database
11312        :type transcripts_column_id: str
11313        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11314        of the column in the variants table where the transcripts information will be stored in JSON
11315        format. This parameter allows you to define the column in the variants table that will hold the
11316        JSON-formatted information about transcripts
11317        :type transcripts_info_json: str
11318        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11319        specify the field in the VCF header that will contain information about transcripts in JSON
11320        format. This field will be added to the VCF header as an INFO field with the specified name
11321        :type transcripts_info_field_json: str
11322        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11323        format of the information about transcripts that will be stored in the variants table. This
11324        format can be used to define how the transcript information will be structured or displayed
11325        within the variants table
11326        :type transcripts_info_format: str
11327        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11328        specify the field in the VCF header that will contain information about transcripts in a
11329        specific format. This field will be added to the VCF header as an INFO field with the specified
11330        name
11331        :type transcripts_info_field_format: str
11332        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11333        that contains various configuration settings related to transcripts. It is used to provide
11334        default values for certain parameters if they are not explicitly provided when calling the
11335        method. The `param` dictionary can be passed as an argument
11336        :type param: dict
11337        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11338        if the operation is successful and `False` if certain conditions are not met.
11339        """
11340
11341        msg_info_prefix = "Start transcripts view to variants annotations"
11342
11343        log.debug(f"{msg_info_prefix}...")
11344
11345        # Default
11346        transcripts_table_default = "transcripts"
11347        transcripts_column_id_default = "transcript"
11348        transcripts_info_json_default = None
11349        transcripts_info_format_default = None
11350        transcripts_info_field_json_default = None
11351        transcripts_info_field_format_default = None
11352
11353        # Param
11354        if not param:
11355            param = self.get_param()
11356
11357        # Transcripts table
11358        if transcripts_table is None:
11359            transcripts_table = param.get("transcripts", {}).get(
11360                "table", transcripts_table_default
11361            )
11362
11363        # Transcripts column ID
11364        if transcripts_column_id is None:
11365            transcripts_column_id = param.get("transcripts", {}).get(
11366                "column_id", transcripts_column_id_default
11367            )
11368
11369        # Transcripts info json
11370        if transcripts_info_json is None:
11371            transcripts_info_json = param.get("transcripts", {}).get(
11372                "transcripts_info_json", transcripts_info_json_default
11373            )
11374
11375        # Transcripts info field JSON
11376        if transcripts_info_field_json is None:
11377            transcripts_info_field_json = param.get("transcripts", {}).get(
11378                "transcripts_info_field_json", transcripts_info_field_json_default
11379            )
11380        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11381        #     transcripts_info_json = transcripts_info_field_json
11382
11383        # Transcripts info format
11384        if transcripts_info_format is None:
11385            transcripts_info_format = param.get("transcripts", {}).get(
11386                "transcripts_info_format", transcripts_info_format_default
11387            )
11388
11389        # Transcripts info field FORMAT
11390        if transcripts_info_field_format is None:
11391            transcripts_info_field_format = param.get("transcripts", {}).get(
11392                "transcripts_info_field_format", transcripts_info_field_format_default
11393            )
11394        # if (
11395        #     transcripts_info_field_format is not None
11396        #     and transcripts_info_format is None
11397        # ):
11398        #     transcripts_info_format = transcripts_info_field_format
11399
11400        # Variants table
11401        table_variants = self.get_table_variants()
11402
11403        # Check info columns param
11404        if (
11405            transcripts_info_json is None
11406            and transcripts_info_field_json is None
11407            and transcripts_info_format is None
11408            and transcripts_info_field_format is None
11409        ):
11410            return False
11411
11412        # Transcripts infos columns
11413        query_transcripts_infos_columns = f"""
11414            SELECT *
11415            FROM (
11416                DESCRIBE SELECT * FROM {transcripts_table}
11417                )
11418            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11419        """
11420        transcripts_infos_columns = list(
11421            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11422        )
11423
11424        # View results
11425        clause_select = []
11426        clause_to_json = []
11427        clause_to_format = []
11428        for field in transcripts_infos_columns:
11429            # Do not consider INFO field for export into fields
11430            if field not in ["INFO"]:
11431                clause_select.append(
11432                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11433                )
11434                clause_to_json.append(f""" '{field}': "{field}" """)
11435                clause_to_format.append(f""" "{field}" """)
11436
11437        # Update
11438        update_set_json = []
11439        update_set_format = []
11440
11441        # VCF header
11442        vcf_reader = self.get_header()
11443
11444        # Transcripts to info column in JSON
11445        if transcripts_info_json:
11446
11447            # Create column on variants table
11448            self.add_column(
11449                table_name=table_variants,
11450                column_name=transcripts_info_json,
11451                column_type="JSON",
11452                default_value=None,
11453                drop=False,
11454            )
11455
11456            # Add header
11457            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11458                transcripts_info_json,
11459                ".",
11460                "String",
11461                "Transcripts in JSON format",
11462                "unknwon",
11463                "unknwon",
11464                self.code_type_map["String"],
11465            )
11466
11467            # Add to update
11468            update_set_json.append(
11469                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11470            )
11471
11472        # Transcripts to info field in JSON
11473        if transcripts_info_field_json:
11474
11475            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11476
11477            # Add to update
11478            update_set_json.append(
11479                f""" 
11480                    INFO = concat(
11481                            CASE
11482                                WHEN INFO NOT IN ('', '.')
11483                                THEN INFO
11484                                ELSE ''
11485                            END,
11486                            CASE
11487                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11488                                THEN concat(
11489                                    ';{transcripts_info_field_json}=',
11490                                    t.{transcripts_info_json}
11491                                )
11492                                ELSE ''
11493                            END
11494                            )
11495                """
11496            )
11497
11498            # Add header
11499            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11500                transcripts_info_field_json,
11501                ".",
11502                "String",
11503                "Transcripts in JSON format",
11504                "unknwon",
11505                "unknwon",
11506                self.code_type_map["String"],
11507            )
11508
11509        if update_set_json:
11510
11511            # Update query
11512            query_update = f"""
11513                UPDATE {table_variants}
11514                    SET {", ".join(update_set_json)}
11515                FROM
11516                (
11517                    SELECT
11518                        "#CHROM", POS, REF, ALT,
11519                            concat(
11520                            '{{',
11521                            string_agg(
11522                                '"' || "{transcripts_column_id}" || '":' ||
11523                                to_json(json_output)
11524                            ),
11525                            '}}'
11526                            )::JSON AS {transcripts_info_json}
11527                    FROM
11528                        (
11529                        SELECT
11530                            "#CHROM", POS, REF, ALT,
11531                            "{transcripts_column_id}",
11532                            to_json(
11533                                {{{",".join(clause_to_json)}}}
11534                            )::JSON AS json_output
11535                        FROM
11536                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11537                        WHERE "{transcripts_column_id}" IS NOT NULL
11538                        )
11539                    GROUP BY "#CHROM", POS, REF, ALT
11540                ) AS t
11541                WHERE {table_variants}."#CHROM" = t."#CHROM"
11542                    AND {table_variants}."POS" = t."POS"
11543                    AND {table_variants}."REF" = t."REF"
11544                    AND {table_variants}."ALT" = t."ALT"
11545            """
11546
11547            self.execute_query(query=query_update)
11548
11549        # Transcripts to info column in FORMAT
11550        if transcripts_info_format:
11551
11552            # Create column on variants table
11553            self.add_column(
11554                table_name=table_variants,
11555                column_name=transcripts_info_format,
11556                column_type="VARCHAR",
11557                default_value=None,
11558                drop=False,
11559            )
11560
11561            # Add header
11562            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11563                transcripts_info_format,
11564                ".",
11565                "String",
11566                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11567                "unknwon",
11568                "unknwon",
11569                self.code_type_map["String"],
11570            )
11571
11572            # Add to update
11573            update_set_format.append(
11574                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11575            )
11576
11577        else:
11578
11579            # Set variable for internal queries
11580            transcripts_info_format = "transcripts_info_format"
11581
11582        # Transcripts to info field in JSON
11583        if transcripts_info_field_format:
11584
11585            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11586
11587            # Add to update
11588            update_set_format.append(
11589                f""" 
11590                    INFO = concat(
11591                            CASE
11592                                WHEN INFO NOT IN ('', '.')
11593                                THEN INFO
11594                                ELSE ''
11595                            END,
11596                            CASE
11597                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11598                                THEN concat(
11599                                    ';{transcripts_info_field_format}=',
11600                                    t.{transcripts_info_format}
11601                                )
11602                                ELSE ''
11603                            END
11604                            )
11605                """
11606            )
11607
11608            # Add header
11609            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11610                transcripts_info_field_format,
11611                ".",
11612                "String",
11613                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11614                "unknwon",
11615                "unknwon",
11616                self.code_type_map["String"],
11617            )
11618
11619        if update_set_format:
11620
11621            # Update query
11622            query_update = f"""
11623                UPDATE {table_variants}
11624                    SET {", ".join(update_set_format)}
11625                FROM
11626                (
11627                    SELECT
11628                        "#CHROM", POS, REF, ALT,
11629                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11630                    FROM 
11631                        (
11632                        SELECT
11633                            "#CHROM", POS, REF, ALT,
11634                            "{transcripts_column_id}",
11635                            concat(
11636                                "{transcripts_column_id}",
11637                                '|',
11638                                {", '|', ".join(clause_to_format)}
11639                            ) AS {transcripts_info_format}
11640                        FROM
11641                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11642                        )
11643                    GROUP BY "#CHROM", POS, REF, ALT
11644                ) AS t
11645                WHERE {table_variants}."#CHROM" = t."#CHROM"
11646                    AND {table_variants}."POS" = t."POS"
11647                    AND {table_variants}."REF" = t."REF"
11648                    AND {table_variants}."ALT" = t."ALT"
11649            """
11650
11651            self.execute_query(query=query_update)
11652
11653        return True

The transcript_view_to_variants function updates a variants table with information from transcripts, either as a JSON-formatted INFO field or as a structured ('|'-delimited) INFO field.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.

def rename_info_fields(self, fields_to_rename: dict = None, table: str = None) -> dict:
11655    def rename_info_fields(
11656        self, fields_to_rename: dict = None, table: str = None
11657    ) -> dict:
11658        """
11659        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11660        corresponding INFO fields in the variants table.
11661
11662        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11663        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11664        represent the original field names that need to be renamed, and the corresponding values
11665        represent the new names to which the fields should be
11666        :type fields_to_rename: dict
11667        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11668        the table in which the variants data is stored. This table contains information about genetic
11669        variants, and the function updates the corresponding INFO fields in this table when renaming
11670        specified fields in the VCF file header
11671        :type table: str
11672        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11673        the original field names as keys and their corresponding new names (or None if the field was
11674        removed) as values after renaming or removing specified fields in a VCF file header and updating
11675        corresponding INFO fields in the variants table.
11676        """
11677
11678        # Init
11679        fields_renamed = {}
11680        config = self.get_config()
11681        access = config.get("access")
11682
11683        if table is None:
11684            table = self.get_table_variants()
11685
11686        # regexp replace fonction
11687        regex_replace_dict = {}
11688        regex_replace_nb = 0
11689        regex_replace_partition = 125
11690        regex_replace = "INFO"
11691
11692        if fields_to_rename is not None and access not in ["RO"]:
11693
11694            log.info("Rename or remove fields...")
11695
11696            # Header
11697            header = self.get_header()
11698
11699            for field_to_rename, field_renamed in fields_to_rename.items():
11700
11701                if field_to_rename in header.infos:
11702
11703                    # Rename header
11704                    if field_renamed is not None:
11705                        header.infos[field_renamed] = vcf.parser._Info(
11706                            field_renamed,
11707                            header.infos[field_to_rename].num,
11708                            header.infos[field_to_rename].type,
11709                            header.infos[field_to_rename].desc,
11710                            header.infos[field_to_rename].source,
11711                            header.infos[field_to_rename].version,
11712                            header.infos[field_to_rename].type_code,
11713                        )
11714                    del header.infos[field_to_rename]
11715
11716                    # Rename INFO patterns
11717                    field_pattern = rf'(^|;)({field_to_rename})($|;|=[^;]*)'
11718                    if field_renamed is not None:
11719                        field_renamed_pattern = rf'\1{field_renamed}\3'
11720                    else:
11721                        field_renamed_pattern = ''
11722
11723                    # regexp replace
11724                    regex_replace_nb += 1
11725                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
11726                    if (regex_replace_nb % regex_replace_partition) == 0:
11727                        regex_replace = "INFO"
11728                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
11729                    regex_replace_dict[regex_replace_key] = regex_replace
11730
11731                    # Return
11732                    fields_renamed[field_to_rename] = field_renamed
11733
11734                    # Log
11735                    if field_renamed is not None:
11736                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
11737                    else:
11738                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")
11739
11740                else:
11741
11742                    log.warning(f"Rename or remove fields - field '{field_to_rename}' not in header")
11743
11744
11745            # Rename INFO
11746            for regex_replace_key, regex_replace  in regex_replace_dict.items():
11747                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
11748                query = f"""
11749                    UPDATE {table}
11750                    SET
11751                        INFO = {regex_replace}
11752                """
11753                log.debug(f"query={query}")
11754                self.execute_query(query=query)
11755
11756        return fields_renamed

The rename_info_fields function renames specified fields in a VCF file header and updates corresponding INFO fields in the variants table.

Parameters
  • fields_to_rename: The fields_to_rename parameter is a dictionary that contains the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be
  • table: The table parameter in the rename_info_fields function represents the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming specified fields in the VCF file header
Returns

The rename_info_fields function returns a dictionary fields_renamed that contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.

def calculation_rename_info_fields( self, fields_to_rename: dict = None, table: str = None, operation_name: str = 'RENAME_INFO_FIELDS') -> None:
11758    def calculation_rename_info_fields(
11759        self,
11760        fields_to_rename: dict = None,
11761        table: str = None,
11762        operation_name: str = "RENAME_INFO_FIELDS",
11763    ) -> None:
11764        """
11765        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11766        fields to rename and table if provided, and then calls another function to rename the fields.
11767
11768        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11769        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11770        the key and the new field name as the value
11771        :type fields_to_rename: dict
11772        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11773        specify the name of the table for which the fields are to be renamed. It is a string type
11774        parameter
11775        :type table: str
11776        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11777        method is a string that specifies the name of the operation being performed. In this context, it
11778        is used as a default value for the operation name if not explicitly provided when calling the
11779        function, defaults to RENAME_INFO_FIELDS
11780        :type operation_name: str (optional)
11781        """
11782
11783        # Param
11784        param = self.get_param()
11785
11786        # Get param fields to rename
11787        param_fields_to_rename = (
11788            param.get("calculation", {})
11789            .get("calculations", {})
11790            .get(operation_name, {})
11791            .get("fields_to_rename", None)
11792        )
11793
11794        # Get param table
11795        param_table = (
11796            param.get("calculation", {})
11797            .get("calculations", {})
11798            .get(operation_name, {})
11799            .get("table", None)
11800        )
11801
11802        # Init fields_to_rename
11803        if fields_to_rename is None:
11804            fields_to_rename = param_fields_to_rename
11805
11806        # Init table
11807        if table is None:
11808            table = param_table
11809
11810        renamed_fields = self.rename_info_fields(
11811            fields_to_rename=fields_to_rename, table=table
11812        )
11813
11814        log.debug(f"renamed_fields:{renamed_fields}")

The calculation_rename_info_fields function retrieves parameters from a dictionary, updates fields to rename and table if provided, and then calls another function to rename the fields.

Parameters
  • fields_to_rename: fields_to_rename is a dictionary that contains the fields to be renamed in a table. Each key-value pair in the dictionary represents the original field name as the key and the new field name as the value
  • table: The table parameter in the calculation_rename_info_fields method is used to specify the name of the table for which the fields are to be renamed. It is a string type parameter
  • operation_name: The operation_name parameter in the calculation_rename_info_fields method is a string that specifies the name of the operation being performed. In this context, it is used as a default value for the operation name if not explicitly provided when calling the function, defaults to RENAME_INFO_FIELDS